diff --git a/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp b/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp index 78b0a464269541..691f9973ce2626 100644 --- a/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp +++ b/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp @@ -1609,7 +1609,21 @@ void MyICJI::allocMem(ULONG hotCodeSize, /* IN */ jitInstance->mc->cr->AddCall("allocMem"); // TODO-Cleanup: Could hot block size be ever 0? - *hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeSize); + size_t codeAlignment = sizeof(void*); + size_t hotCodeAlignedSize = static_cast<size_t>(hotCodeSize); + + if ((flag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) + { + codeAlignment = 32; + } + else if ((flag & CORJIT_ALLOCMEM_FLG_16BYTE_ALIGN) != 0) + { + codeAlignment = 16; + } + hotCodeAlignedSize = ALIGN_UP_SPMI(hotCodeAlignedSize, codeAlignment); + hotCodeAlignedSize = hotCodeAlignedSize + (codeAlignment - sizeof(void*)); + *hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeAlignedSize); + *hotCodeBlock = ALIGN_UP_SPMI(*hotCodeBlock, codeAlignment); if (coldCodeSize > 0) *coldCodeBlock = jitInstance->mc->cr->allocateMemory(coldCodeSize); diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h index fb0d859f8db8b7..6ddd274ac9fce8 100644 --- a/src/coreclr/inc/clrconfigvalues.h +++ b/src/coreclr/inc/clrconfigvalues.h @@ -302,7 +302,6 @@ RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_UseIBCFile, W("UseIBCFile"), 0, "", CLRConf /// /// JIT /// -RETAIL_CONFIG_DWORD_INFO_DIRECT_ACCESS(UNSUPPORTED_JitAlignLoops, W("JitAlignLoops"), "Aligns loop targets to 8 byte boundaries") CONFIG_DWORD_INFO_EX(INTERNAL_JitBreakEmit, W("JitBreakEmit"), (DWORD)-1, "", CLRConfig::EEConfig_default) CONFIG_DWORD_INFO_DIRECT_ACCESS(INTERNAL_JitDebuggable, W("JitDebuggable"), "") #if !defined(DEBUG) && !defined(_DEBUG) diff --git a/src/coreclr/inc/corjitflags.h b/src/coreclr/inc/corjitflags.h index 83cbc20be8863a..5cea8a224c609d 100644 --- a/src/coreclr/inc/corjitflags.h +++ b/src/coreclr/inc/corjitflags.h @@ -79,45 +79,45 @@ class CORJIT_FLAGS CORJIT_FLAG_BBINSTR = 29, // Collect basic block profile information CORJIT_FLAG_BBOPT = 30, // Optimize method based on profile information CORJIT_FLAG_FRAMED = 31, // All methods have an EBP frame - CORJIT_FLAG_ALIGN_LOOPS = 32, // add NOPs before loops to align them at 16 byte boundaries + CORJIT_FLAG_UNUSED12 = 32, CORJIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0. 
(used by IL stubs) - CORJIT_FLAG_UNUSED12 = 34, + CORJIT_FLAG_UNUSED13 = 34, CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background CORJIT_FLAG_USE_PINVOKE_HELPERS = 36, // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions CORJIT_FLAG_REVERSE_PINVOKE = 37, // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog - CORJIT_FLAG_UNUSED13 = 38, + CORJIT_FLAG_UNUSED14 = 38, CORJIT_FLAG_TIER0 = 39, // This is the initial tier for tiered compilation which should generate code as quickly as possible CORJIT_FLAG_TIER1 = 40, // This is the final tier (for now) for tiered compilation which should generate high quality code #if defined(TARGET_ARM) CORJIT_FLAG_RELATIVE_CODE_RELOCS = 41, // JIT should generate PC-relative address computations instead of EE relocation records #else // !defined(TARGET_ARM) - CORJIT_FLAG_UNUSED14 = 41, + CORJIT_FLAG_UNUSED15 = 41, #endif // !defined(TARGET_ARM) CORJIT_FLAG_NO_INLINING = 42, // JIT should not inline any called method into this method - CORJIT_FLAG_UNUSED15 = 43, - CORJIT_FLAG_UNUSED16 = 44, - CORJIT_FLAG_UNUSED17 = 45, - CORJIT_FLAG_UNUSED18 = 46, - CORJIT_FLAG_UNUSED19 = 47, - CORJIT_FLAG_UNUSED20 = 48, - CORJIT_FLAG_UNUSED21 = 49, - CORJIT_FLAG_UNUSED22 = 50, - CORJIT_FLAG_UNUSED23 = 51, - CORJIT_FLAG_UNUSED24 = 52, - CORJIT_FLAG_UNUSED25 = 53, - CORJIT_FLAG_UNUSED26 = 54, - CORJIT_FLAG_UNUSED27 = 55, - CORJIT_FLAG_UNUSED28 = 56, - CORJIT_FLAG_UNUSED29 = 57, - CORJIT_FLAG_UNUSED30 = 58, - CORJIT_FLAG_UNUSED31 = 59, - CORJIT_FLAG_UNUSED32 = 60, - CORJIT_FLAG_UNUSED33 = 61, - CORJIT_FLAG_UNUSED34 = 62, - CORJIT_FLAG_UNUSED35 = 63 + CORJIT_FLAG_UNUSED16 = 43, + CORJIT_FLAG_UNUSED17 = 44, + CORJIT_FLAG_UNUSED18 = 45, + CORJIT_FLAG_UNUSED19 = 46, + CORJIT_FLAG_UNUSED20 = 47, + CORJIT_FLAG_UNUSED21 = 48, + CORJIT_FLAG_UNUSED22 = 49, + CORJIT_FLAG_UNUSED23 = 50, + CORJIT_FLAG_UNUSED24 = 51, + CORJIT_FLAG_UNUSED25 = 52, + CORJIT_FLAG_UNUSED26 = 53, + CORJIT_FLAG_UNUSED27 = 54, + CORJIT_FLAG_UNUSED28 = 55, + CORJIT_FLAG_UNUSED29 = 56, + CORJIT_FLAG_UNUSED30 = 57, + CORJIT_FLAG_UNUSED31 = 58, + CORJIT_FLAG_UNUSED32 = 59, + CORJIT_FLAG_UNUSED33 = 60, + CORJIT_FLAG_UNUSED34 = 61, + CORJIT_FLAG_UNUSED35 = 62, + CORJIT_FLAG_UNUSED36 = 63 }; CORJIT_FLAGS() diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h index 6ee29b5a00fae6..e67969b5222d5a 100644 --- a/src/coreclr/inc/jiteeversionguid.h +++ b/src/coreclr/inc/jiteeversionguid.h @@ -31,11 +31,11 @@ // ////////////////////////////////////////////////////////////////////////////////////////////////////////// -constexpr GUID JITEEVersionIdentifier = { /* 8e32c24d-62fe-4d78-ae73-eedddb928ee2 */ - 0x8e32c24d, - 0x62fe, - 0x4d78, - {0xae, 0x73, 0xee, 0xdd, 0xdb, 0x92, 0x8e, 0xe2} +constexpr GUID JITEEVersionIdentifier = { /* de81f48e-7701-45f2-a91b-1914f88dfd11 */ + 0xde81f48e, + 0x7701, + 0x45f2, + {0xa9, 0x1b, 0x19, 0x14, 0xf8, 0x8d, 0xfd, 0x11} }; ////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp index f2b14599335f5f..6cea8dd2c367a2 100644 --- a/src/coreclr/jit/block.cpp +++ b/src/coreclr/jit/block.cpp @@ -505,6 +505,10 @@ void BasicBlock::dspFlags() { printf("cfe "); } + if (bbFlags & BBF_LOOP_ALIGN) + { + printf("align "); + } } /***************************************************************************** diff --git 
a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h index 02c37361e831ce..d92f5b2c3550c1 100644 --- a/src/coreclr/jit/block.h +++ b/src/coreclr/jit/block.h @@ -448,6 +448,7 @@ struct BasicBlock : private LIR::Range #define BBF_PATCHPOINT MAKE_BBFLAG(36) // Block is a patchpoint #define BBF_HAS_CLASS_PROFILE MAKE_BBFLAG(37) // BB contains a call needing a class profile +#define BBF_LOOP_ALIGN MAKE_BBFLAG(39) // Block is lexically the first block in a loop we intend to align. // clang-format on @@ -463,6 +464,10 @@ struct BasicBlock : private LIR::Range { return ((bbFlags & BBF_LOOP_HEAD) != 0); } + bool isLoopAlign() const + { + return ((bbFlags & BBF_LOOP_ALIGN) != 0); + } // Flags to update when two blocks are compacted diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp index d6eebc9d416152..8c4572dcec43f5 100644 --- a/src/coreclr/jit/codegencommon.cpp +++ b/src/coreclr/jit/codegencommon.cpp @@ -2258,6 +2258,12 @@ void CodeGen::genGenerateMachineCode() GetEmitter()->emitJumpDistBind(); +#if FEATURE_LOOP_ALIGN + /* Perform alignment adjustments */ + + GetEmitter()->emitLoopAlignAdjustments(); +#endif + /* The code is now complete and final; it should not change after this. */ } @@ -2345,10 +2351,12 @@ void CodeGen::genEmitMachineCode() #ifdef DEBUG if (compiler->opts.disAsm || verbose) { - printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d (MethodHash=%08x) for " + printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d, allocated bytes for " + "code %d (MethodHash=%08x) for " "method %s\n", - codeSize, prologSize, compiler->info.compPerfScore, instrCount, compiler->info.compMethodHash(), - compiler->info.compFullName); + codeSize, prologSize, compiler->info.compPerfScore, instrCount, + GetEmitter()->emitTotalHotCodeSize + GetEmitter()->emitTotalColdCodeSize, + compiler->info.compMethodHash(), compiler->info.compFullName); printf("; ============================================================\n\n"); printf(""); // in our logic this causes a flush } diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp index bf8d1ce087adf9..215e3c04f75b59 100644 --- a/src/coreclr/jit/codegenlinear.cpp +++ b/src/coreclr/jit/codegenlinear.cpp @@ -311,13 +311,6 @@ void CodeGen::genCodeForBBlist() genUpdateCurrentFunclet(block); -#ifdef TARGET_XARCH - if (ShouldAlignLoops() && block->bbFlags & BBF_LOOP_HEAD) - { - GetEmitter()->emitLoopAlign(); - } -#endif - genLogLabel(block); // Tell everyone which basic block we're working on @@ -356,6 +349,14 @@ void CodeGen::genCodeForBBlist() needLabel = true; } +#if FEATURE_LOOP_ALIGN + if (GetEmitter()->emitEndsWithAlignInstr()) + { + // we had better be planning on starting a new IG + assert(needLabel); + } +#endif + if (needLabel) { // Mark a label and update the current set of live GC refs @@ -667,10 +668,6 @@ void CodeGen::genCodeForBBlist() switch (block->bbJumpKind) { - case BBJ_ALWAYS: - inst_JMP(EJ_jmp, block->bbJumpDest); - break; - case BBJ_RETURN: genExitCode(block); break; @@ -741,15 +738,55 @@ void CodeGen::genCodeForBBlist() #endif // !FEATURE_EH_FUNCLETS case BBJ_NONE: - case BBJ_COND: case BBJ_SWITCH: break; + case BBJ_ALWAYS: + inst_JMP(EJ_jmp, block->bbJumpDest); + FALLTHROUGH; + + case BBJ_COND: + +#if FEATURE_LOOP_ALIGN + // This is the last place where we operate on blocks and after this, we operate + // on IG. 
Hence, if we know that the destination of "block" is the first block + // of a loop and needs alignment (it has BBF_LOOP_ALIGN), then "block" represents the + // end of the loop. Propagate that information on the IG through "igLoopBackEdge". + // + // During emission, this information will be used to calculate the loop size. + // Depending on the loop size, we will then decide whether or not to align the loop. + + if (block->bbJumpDest->isLoopAlign()) + { + GetEmitter()->emitSetLoopBackEdge(block->bbJumpDest); + } +#endif + break; + default: noway_assert(!"Unexpected bbJumpKind"); break; } +#if FEATURE_LOOP_ALIGN + + // If the next block is the first block of a loop (identified by BBF_LOOP_ALIGN), + // then we need to add an align instruction in the current "block". Also mark the + // corresponding IG with IGF_LOOP_ALIGN so we know that there will be align + // instructions at the end of that IG. + // + // For non-adaptive alignment, add an alignment instruction whose size depends on + // compJitAlignLoopBoundary. + // For adaptive alignment, the alignment instruction will always be 15 bytes. + + if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign())) + { + assert(ShouldAlignLoops()); + + GetEmitter()->emitLoopAlignment(); + } +#endif + #if defined(DEBUG) && defined(USING_VARIABLE_LIVE_RANGE) if (compiler->verbose) { diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp index b25afff5a6fdaa..7f53629f25496f 100644 --- a/src/coreclr/jit/compiler.cpp +++ b/src/coreclr/jit/compiler.cpp @@ -2308,7 +2308,7 @@ void Compiler::compSetProcessor() opts.compUseCMOV = jitFlags.IsSet(JitFlags::JIT_FLAG_USE_CMOV); #ifdef DEBUG if (opts.compUseCMOV) - opts.compUseCMOV = !compStressCompile(STRESS_USE_CMOV, 50); + opts.compUseCMOV = !compStressCompile(STRESS_USE_CMOV, 50); #endif // DEBUG #endif // TARGET_X86 @@ -2615,6 +2615,29 @@ void Compiler::compInitOptions(JitFlags* jitFlags) opts.compDbgInfo = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_INFO); opts.compDbgEnC = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_EnC); +#ifdef DEBUG + opts.compJitAlignLoopAdaptive = JitConfig.JitAlignLoopAdaptive() == 1; + opts.compJitAlignLoopBoundary = (unsigned short)JitConfig.JitAlignLoopBoundary(); + opts.compJitAlignLoopMinBlockWeight = (unsigned short)JitConfig.JitAlignLoopMinBlockWeight(); + + opts.compJitAlignLoopForJcc = JitConfig.JitAlignLoopForJcc() == 1; + opts.compJitAlignLoopMaxCodeSize = (unsigned short)JitConfig.JitAlignLoopMaxCodeSize(); +#else + opts.compJitAlignLoopAdaptive = true; + opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY; + opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT; +#endif + if (opts.compJitAlignLoopAdaptive) + { + opts.compJitAlignPaddingLimit = (opts.compJitAlignLoopBoundary >> 1) - 1; + } + else + { + opts.compJitAlignPaddingLimit = opts.compJitAlignLoopBoundary - 1; + } + + assert(isPow2(opts.compJitAlignLoopBoundary)); + #if REGEN_SHORTCUTS || REGEN_CALLPAT // We never want to have debugging enabled when regenerating GC encoding patterns opts.compDbgCode = false; @@ -3913,19 +3936,17 @@ void Compiler::compSetOptimizationLevel() codeGen->setFrameRequired(true); #endif - if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_RELOC)) + if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT)) { - codeGen->SetAlignLoops(false); // loop alignment not supported for prejitted code - - // The zapper doesn't set JitFlags::JIT_FLAG_ALIGN_LOOPS, and there is - // no reason for it to set it as the JIT doesn't currently support loop alignment - // for prejitted images. 
(The JIT doesn't know the final address of the code, hence + // The JIT doesn't currently support loop alignment for prejitted images. + // (The JIT doesn't know the final address of the code, hence // it can't align code based on unknown addresses.) - assert(!opts.jitFlags->IsSet(JitFlags::JIT_FLAG_ALIGN_LOOPS)); + + codeGen->SetAlignLoops(false); // loop alignment not supported for prejitted code } else { - codeGen->SetAlignLoops(opts.jitFlags->IsSet(JitFlags::JIT_FLAG_ALIGN_LOOPS)); + codeGen->SetAlignLoops(JitConfig.JitAlignLoops() == 1); } } diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 1cdebfb9c3c8aa..9af31fdf03a071 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6367,6 +6367,8 @@ class Compiler void optFindNaturalLoops(); + void optIdentifyLoopsForAlignment(); + // Ensures that all the loops in the loop nest rooted at "loopInd" (an index into the loop table) are 'canonical' -- // each loop has a unique "top." Returns "true" iff the flowgraph has been modified. bool optCanonicalizeLoopNest(unsigned char loopInd); @@ -9036,6 +9038,43 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX bool dspGCtbls; // Display the GC tables #endif +// Default numbers used to perform loop alignment. All the numbers are chosen +// based on experimenting with various benchmarks. + +// Default minimum loop block weight required to enable loop alignment. +#define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 4 + +// By default a loop will be aligned at 32B address boundary to get better +// performance as per architecture manuals. +#define DEFAULT_ALIGN_LOOP_BOUNDARY 0x20 + +// For non-adaptive loop alignment, by default, only align a loop whose size is +// at most 3 times the alignment block size. If the loop is bigger than that, it is most +// likely complicated enough that loop alignment will not impact performance. +#define DEFAULT_MAX_LOOPSIZE_FOR_ALIGN (DEFAULT_ALIGN_LOOP_BOUNDARY * 3) + +#ifdef DEBUG + // Loop alignment variables + + // If set, for non-adaptive alignment, ensure loop jmps do not sit on or cross the alignment boundary. + bool compJitAlignLoopForJcc; #endif + // For non-adaptive alignment, maximum loop size (in bytes) for which alignment will be done. + unsigned short compJitAlignLoopMaxCodeSize; + + // Minimum weight needed for the first block of a loop to make it a candidate for alignment. + unsigned short compJitAlignLoopMinBlockWeight; + + // For non-adaptive alignment, address boundary (power of 2) at which loop alignment should + // be done. By default, 32B. + unsigned short compJitAlignLoopBoundary; + + // Padding limit to align a loop. + unsigned short compJitAlignPaddingLimit; + + // If set, perform adaptive loop alignment that limits the amount of padding based on loop size. 
+ bool compJitAlignLoopAdaptive; + #ifdef LATE_DISASM bool doLateDisasm; // Run the late disassembler #endif // LATE_DISASM diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp index 3056c71e6a0932..b42111611504d7 100644 --- a/src/coreclr/jit/emit.cpp +++ b/src/coreclr/jit/emit.cpp @@ -160,6 +160,8 @@ unsigned emitter::emitSmallCnsCnt; unsigned emitter::emitLargeCnsCnt; unsigned emitter::emitSmallCns[SMALL_CNS_TSZ]; +unsigned emitter::emitTotalDescAlignCnt; + void emitterStaticStats(FILE* fout) { // insGroup members @@ -387,6 +389,9 @@ void emitterStats(FILE* fout) fprintf(fout, "Total instrDescReloc: %8u (%5.2f%%)\n", emitter::emitTotalIDescRelocCnt, 100.0 * emitter::emitTotalIDescRelocCnt / emitter::emitTotalInsCnt); #endif // TARGET_ARM + fprintf(fout, "Total emitTotalDescAlignCnt: %8u (%5.2f%%)\n", emitter::emitTotalDescAlignCnt, + 100.0 * emitter::emitTotalDescAlignCnt / emitter::emitTotalInsCnt); + fprintf(fout, "\n"); } @@ -636,6 +641,10 @@ void emitter::emitGenIG(insGroup* ig) assert(emitCurIGjmpList == nullptr); +#if FEATURE_LOOP_ALIGN + assert(emitCurIGAlignList == nullptr); +#endif + /* Allocate the temp instruction buffer if we haven't done so */ if (emitCurIGfreeBase == nullptr) @@ -822,6 +831,60 @@ insGroup* emitter::emitSavIG(bool emitAdd) } #endif +#if FEATURE_LOOP_ALIGN + // Did we have any align instructions in this group? + if (emitCurIGAlignList) + { + instrDescAlign* list = nullptr; + instrDescAlign* last = nullptr; + + // Move align instructions to the global list, update their 'next' links + do + { + // Grab the align instruction and remove it from the list + + instrDescAlign* oa = emitCurIGAlignList; + emitCurIGAlignList = oa->idaNext; + + // Figure out the address of where the align got copied + + size_t of = (BYTE*)oa - emitCurIGfreeBase; + instrDescAlign* na = (instrDescAlign*)(ig->igData + of); + + assert(na->idaIG == ig); + assert(na->idIns() == oa->idIns()); + assert(na->idaNext == oa->idaNext); + assert(na->idIns() == INS_align); + + na->idaNext = list; + list = na; + + if (last == nullptr) + { + last = na; + } + } while (emitCurIGAlignList); + + // Should have at least one align instruction + assert(last); + + if (emitAlignList == nullptr) + { + assert(emitAlignLast == nullptr); + + last->idaNext = emitAlignList; + emitAlignList = list; + } + else + { + last->idaNext = nullptr; + emitAlignLast->idaNext = list; + } + + emitAlignLast = last; + } + +#endif // Did we have any jumps in this group? 
if (emitCurIGjmpList) @@ -933,6 +996,12 @@ void emitter::emitBegFN(bool hasFramePtr emitCurIGfreeBase = nullptr; emitIGbuffSize = 0; +#if FEATURE_LOOP_ALIGN + emitLastAlignedIgNum = 0; + emitLastInnerLoopStartIgNum = 0; + emitLastInnerLoopEndIgNum = 0; +#endif + /* Record stack frame info (the temp size is just an estimate) */ emitHasFramePtr = hasFramePtr; @@ -968,6 +1037,13 @@ void emitter::emitBegFN(bool hasFramePtr emitNoGCIG = false; emitForceNewIG = false; +#if FEATURE_LOOP_ALIGN + /* We don't have any align instructions */ + + emitAlignList = emitAlignLast = nullptr; + emitCurIGAlignList = nullptr; +#endif + /* We have not recorded any live sets */ assert(VarSetOps::IsEmpty(emitComp, emitThisGCrefVars)); @@ -3613,6 +3689,10 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) { size_t is; +#ifdef DEBUG + size_t beforeAddr = (size_t)*dp; +#endif + /* Record the beginning offset of the instruction */ BYTE* curInsAdr = *dp; @@ -3647,17 +3727,23 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) /* Did the size of the instruction match our expectations? */ - UNATIVE_OFFSET csz = (UNATIVE_OFFSET)(*dp - curInsAdr); + UNATIVE_OFFSET actualSize = (UNATIVE_OFFSET)(*dp - curInsAdr); - if (csz != id->idCodeSize()) + unsigned estimatedSize = id->idCodeSize(); + if (actualSize != estimatedSize) { - /* It is fatal to under-estimate the instruction size */ - noway_assert(id->idCodeSize() >= csz); + // It is fatal to under-estimate the instruction size, except for alignment instructions + noway_assert(estimatedSize >= actualSize); + +#if FEATURE_LOOP_ALIGN + // Should never over-estimate align instruction or any instruction before the last align instruction of a method + assert(id->idIns() != INS_align && emitCurIG->igNum > emitLastAlignedIgNum); +#endif #if DEBUG_EMIT if (EMITVERBOSE) { - printf("Instruction predicted size = %u, actual = %u\n", id->idCodeSize(), csz); + printf("Instruction predicted size = %u, actual = %u\n", estimatedSize, actualSize); } #endif // DEBUG_EMIT @@ -3665,7 +3751,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) ig->igFlags |= IGF_UPD_ISZ; #if defined(TARGET_XARCH) - id->idCodeSize(csz); + id->idCodeSize(actualSize); #elif defined(TARGET_ARM) // This is done as part of emitSetShortJump(); // insSize isz = emitInsSize(id->idInsFmt()); @@ -3684,6 +3770,51 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp) id->idDebugOnlyInfo()->idNum, is, emitSizeOfInsDsc(id)); assert(is == emitSizeOfInsDsc(id)); } + + // Print the alignment boundary + if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr) + { + size_t currAddr = (size_t)*dp; + size_t lastBoundaryAddr = currAddr & ~((size_t)emitComp->opts.compJitAlignLoopBoundary - 1); + + // draw boundary if beforeAddr was before the lastBoundary. 
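+ // (i.e. the instruction beginning at beforeAddr is the first one to touch or cross that boundary)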
+ if (beforeAddr < lastBoundaryAddr) + { + printf("; "); + instruction currIns = id->idIns(); + +#if defined(TARGET_XARCH) + + // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf + bool isJccAffectedIns = + ((currIns >= INS_i_jmp && currIns < INS_align) || (currIns == INS_call) || (currIns == INS_ret)); + + instrDesc* nextId = id; + castto(nextId, BYTE*) += is; + instruction nextIns = nextId->idIns(); + if ((currIns == INS_cmp) || (currIns == INS_test) || (currIns == INS_add) || (currIns == INS_sub) || + (currIns == INS_and) || (currIns == INS_inc) || (currIns == INS_dec)) + { + isJccAffectedIns |= (nextIns >= INS_i_jmp && nextIns < INS_align); + } +#else + bool isJccAffectedIns = false; +#endif + + // Indicate if the instruction is on the 32B boundary or is split across it + unsigned bytesCrossedBoundary = (currAddr & (emitComp->opts.compJitAlignLoopBoundary - 1)); + if ((bytesCrossedBoundary != 0) || isJccAffectedIns) + { + printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(id->idIns()), + bytesCrossedBoundary); + } + else + { + printf("..............................."); + } + printf(" %dB boundary ...............................\n", (emitComp->opts.compJitAlignLoopBoundary)); + } + } #endif return is; } @@ -4479,6 +4610,428 @@ void emitter::emitJumpDistBind() #endif // DEBUG } +#if FEATURE_LOOP_ALIGN + +//----------------------------------------------------------------------------- +// emitLoopAlignment: Insert an align instruction at the end of emitCurIG and +// mark it as IGF_LOOP_ALIGN to indicate that next IG is a +// loop needing alignment. +// +void emitter::emitLoopAlignment() +{ + if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive)) + { + emitLongLoopAlign(emitComp->opts.compJitAlignLoopBoundary); + } + else + { + emitLoopAlign(); + } + + // Mark this IG as needing alignment so that during emission we can check the instruction count heuristics of + // all IGs that follow this IG and participate in a loop. + emitCurIG->igFlags |= IGF_LOOP_ALIGN; + + JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u.\n", emitComp->opts.compJitAlignLoopBoundary, + emitComp->compMethodID, emitCurIG->igNum); +} + +//----------------------------------------------------------------------------- +// emitEndsWithAlignInstr: Checks if current IG ends with loop align instruction. +// +// Returns: true if current IG ends with align instruction. +// +bool emitter::emitEndsWithAlignInstr() +{ + return emitCurIG->isLoopAlign(); +} + +//----------------------------------------------------------------------------- +// getLoopSize: Starting from loopHeaderIg, find the size of the smallest possible loop +// such that it doesn't exceed the maxLoopSize. +// +// Arguments: +// igLoopHeader - The header IG of a loop +// maxLoopSize - Maximum loop size. If the loop is bigger than this value, we will just +// return this value. +// +// Returns: size of a loop in bytes. +// +unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize) +{ + unsigned loopSize = 0; + + for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext) + { + loopSize += igInLoop->igSize; + if (igInLoop->isLoopAlign()) + { + // If igInLoop's next IG is a loop and needs alignment, then igInLoop should be the last IG + // of the current loop and should have a back-edge to the current loop header. 
+ assert(igInLoop->igLoopBackEdge == igLoopHeader); + + // In such cases, the current loop size should exclude the align instruction size reserved for + // next loop. + loopSize -= emitComp->opts.compJitAlignPaddingLimit; + } + if ((igInLoop->igLoopBackEdge == igLoopHeader) || (loopSize > maxLoopSize)) + { + break; + } + } + + return loopSize; +} + +//----------------------------------------------------------------------------- +// emitSetLoopBackEdge : Sets igLoopBackEdge field, if not already set and +// if currIG has back-edge to dstIG. +// +// Notes: +// If the current loop encloses a loop that is already marked as align, then remove +// the alignment flag present on IG before dstIG. +// +void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock) +{ + insGroup* dstIG = (insGroup*)loopTopBlock->bbEmitCookie; + + // With (dstIG != nullptr), ensure that only back edges are tracked. + // If there is forward jump, dstIG is not yet generated. + // + // We don't rely on (block->bbJumpDest->bbNum <= block->bbNum) because the basic + // block numbering is not guaranteed to be sequential. + + if ((dstIG != nullptr) && (dstIG->igNum <= emitCurIG->igNum)) + { + unsigned currLoopStart = dstIG->igNum; + unsigned currLoopEnd = emitCurIG->igNum; + + // Only mark back-edge if current loop starts after the last inner loop ended. + if (emitLastInnerLoopEndIgNum < currLoopStart) + { + emitCurIG->igLoopBackEdge = dstIG; + + JITDUMP("** IG%02u jumps back to IG%02u forming a loop.\n", currLoopEnd, currLoopStart); + + emitLastInnerLoopStartIgNum = currLoopStart; + emitLastInnerLoopEndIgNum = currLoopEnd; + } + // Otherwise, mark the dstIG->prevIG as no alignment needed. + // + // Note: If current loop's back-edge target is same as emitLastInnerLoopStartIgNum, + // retain the alignment flag of dstIG->prevIG so the loop + // (emitLastInnerLoopStartIgNum ~ emitLastInnerLoopEndIgNum) is still aligned. + else if (emitLastInnerLoopStartIgNum != currLoopStart) + { + // Find the IG before dstIG... + instrDescAlign* alignInstr = emitAlignList; + while ((alignInstr != nullptr) && (alignInstr->idaIG->igNext != dstIG)) + { + alignInstr = alignInstr->idaNext; + } + + // ...and clear the IGF_LOOP_ALIGN flag + if (alignInstr != nullptr) + { + assert(alignInstr->idaIG->igNext == dstIG); + alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN; + } + + JITDUMP( + "** Skip alignment for loop IG%02u ~ IG%02u, because it encloses an aligned loop IG%02u ~ IG%02u.\n", + currLoopStart, currLoopEnd, emitLastInnerLoopStartIgNum, emitLastInnerLoopEndIgNum); + } + } +} + +//----------------------------------------------------------------------------- +// emitLoopAlignAdjustments: Walk all the align instructions and update them +// with actual padding needed. + +// Notes: +// For IGs that have align instructions in the end, calculate the actual offset +// of loop start and determine how much padding is needed. Based on that, update +// the igOffs, igSize and emitTotalCodeSize. 
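+// For example, with adaptive alignment at a 32B boundary each align instruction initially reserves the full +// 15-byte padding limit; whatever is not needed at the final offsets is trimmed off here.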
+// +void emitter::emitLoopAlignAdjustments() +{ + // no align instructions + if (emitAlignList == nullptr) + { + return; + } + + JITDUMP("*************** In emitLoopAlignAdjustments()\n"); + + unsigned short estimatedPaddingNeeded = emitComp->opts.compJitAlignPaddingLimit; + unsigned short alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary; + + unsigned alignBytesRemoved = 0; + unsigned loopSize = 0; + unsigned loopIGOffset = 0; + instrDescAlign* alignInstr = emitAlignList; + + for (; alignInstr != nullptr; alignInstr = alignInstr->idaNext) + { + assert(alignInstr->idIns() == INS_align); + + insGroup* alignIG = alignInstr->idaIG; + + loopIGOffset = alignIG->igOffs + alignIG->igSize; + + // igSize also includes INS_align instruction, take it off. + loopIGOffset -= estimatedPaddingNeeded; + + // IG can be marked as not needing alignment if during setting igLoopBackEdge, it is detected + // that the igLoopBackEdge encloses an IG that is marked for alignment. + unsigned actualPaddingNeeded = + alignIG->isLoopAlign() ? emitCalculatePaddingForLoopAlignment(alignIG, loopIGOffset DEBUG_ARG(false)) : 0; + + assert(estimatedPaddingNeeded >= actualPaddingNeeded); + + unsigned short diff = (unsigned short)(estimatedPaddingNeeded - actualPaddingNeeded); + + if (diff != 0) + { + alignIG->igSize -= diff; + alignBytesRemoved += diff; + emitTotalCodeSize -= diff; + + // Update the flags + alignIG->igFlags |= IGF_UPD_ISZ; + if (actualPaddingNeeded == 0) + { + alignIG->igFlags &= ~IGF_LOOP_ALIGN; + } + + if (emitComp->opts.compJitAlignLoopAdaptive) + { + assert(actualPaddingNeeded < MAX_ENCODED_SIZE); + alignInstr->idCodeSize(actualPaddingNeeded); + } + else + { + unsigned paddingToAdj = actualPaddingNeeded; + +#ifdef DEBUG + + int instrAdjusted = (alignmentBoundary + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE; +#endif + // Adjust the padding amount in all align instructions in this IG + instrDescAlign *alignInstrToAdj = alignInstr, *prevAlignInstr = nullptr; + for (; alignInstrToAdj != nullptr && alignInstrToAdj->idaIG == alignInstr->idaIG; + alignInstrToAdj = alignInstrToAdj->idaNext) + { + unsigned newPadding = min(paddingToAdj, MAX_ENCODED_SIZE); + alignInstrToAdj->idCodeSize(newPadding); + paddingToAdj -= newPadding; + prevAlignInstr = alignInstrToAdj; +#ifdef DEBUG + instrAdjusted--; +#endif + } + assert(paddingToAdj == 0); + assert(instrAdjusted == 0); + + // fast forward the align instruction to next IG + alignInstr = prevAlignInstr; + } + + JITDUMP("Adjusted alignment of G_M%03u_IG%02u from %02d to %02d\n", emitComp->compMethodID, alignIG->igNum, + estimatedPaddingNeeded, actualPaddingNeeded); + } + + // Adjust the offset of all IGs starting from next IG until we reach the IG having the next + // align instruction or the end of IG list. + insGroup* adjOffIG = alignIG->igNext; + insGroup* adjOffUptoIG = alignInstr->idaNext != nullptr ? alignInstr->idaNext->idaIG : emitIGlast; + while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum)) + { + adjOffIG->igOffs -= alignBytesRemoved; + adjOffIG = adjOffIG->igNext; + } + + if (actualPaddingNeeded > 0) + { + // Record the last IG that has align instruction. No overestimation + // adjustment will be done after emitLastAlignedIgNum. 
+ emitLastAlignedIgNum = alignIG->igNum; + } + } + +#ifdef DEBUG + emitCheckIGoffsets(); +#endif +} + +//----------------------------------------------------------------------------- +// emitCalculatePaddingForLoopAlignment: Calculate the padding to insert at the +// end of 'ig' so the loop that starts after 'ig' is aligned. +// +// Returns: Padding amount. +// 0 means no padding is needed, either because loop is already aligned or it +// is too expensive to align loop and hence it will not be aligned. +// +// Notes: +// Below are the steps (in this order) to calculate the padding amount. +// 1. If loop is already aligned to desired boundary, then return 0. // already aligned +// 2. If loop size exceeds maximum allowed loop size, then return 0. // too big to align +// +// For adaptive loop alignment: +// 3a. Calculate paddingNeeded and maxPaddingAmount to align to 32B boundary. +// 3b. If paddingNeeded > maxPaddingAmount, then recalculate to align to 16B boundary. +// 3c. If paddingNeeded == 0, then return 0. // already aligned at 16B +// 3d. If paddingNeeded > maxPaddingAmount, then return 0. // expensive to align +// 3e. If the loop already fits in minimum 32B blocks, then return 0. // already best aligned +// 3f. return paddingNeeded. +// +// For non-adaptive loop alignment: +// 3a. Calculate paddingNeeded. +// 3b. If the loop already fits in minimum alignmentBoundary blocks, then return 0. // already best aligned +// 3c. return paddingNeeded. +// +unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, + size_t offset DEBUG_ARG(bool displayAlignmentDetails)) +{ + assert(ig->isLoopAlign()); + unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary; + + // No padding if loop is already aligned + if ((offset & (alignmentBoundary - 1)) == 0) + { + JITDUMP(";; Skip alignment: 'Loop already aligned at %dB boundary.'\n", alignmentBoundary); + return 0; + } + + unsigned maxLoopSize = 0; + int maxLoopBlocksAllowed = 0; + + if (emitComp->opts.compJitAlignLoopAdaptive) + { + // For adaptive, adjust the loop size depending on the alignment boundary + maxLoopBlocksAllowed = genLog2((unsigned)alignmentBoundary) - 1; + maxLoopSize = alignmentBoundary * maxLoopBlocksAllowed; + } + else + { + // For non-adaptive, just take whatever is supplied using COMPlus_ variables + maxLoopSize = emitComp->opts.compJitAlignLoopMaxCodeSize; + } + + unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize); + + // No padding if loop is big + if (loopSize > maxLoopSize) + { + JITDUMP(";; Skip alignment: 'Loop is big. LoopSize= %d, MaxLoopSize= %d.'\n", loopSize, maxLoopSize); + return 0; + } + + unsigned paddingToAdd = 0; + unsigned minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary; + bool skipPadding = false; + + if (emitComp->opts.compJitAlignLoopAdaptive) + { + // adaptive loop alignment + unsigned nMaxPaddingBytes = (1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1)) - 1; + unsigned nPaddingBytes = (-(int)(size_t)offset) & (alignmentBoundary - 1); + + // Check if the alignment exceeds maxPadding limit + if (nPaddingBytes > nMaxPaddingBytes) + { + // Cannot align to 32B, so try to align to 16B boundary. 
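+ // Halve the boundary and recompute both the padding needed and the padding budget.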
+ alignmentBoundary >>= 1; + nMaxPaddingBytes = 1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1); + nPaddingBytes = (-(int)(size_t)offset) & (alignmentBoundary - 1); + + // Check if the loop is already at new alignment boundary + if (nPaddingBytes == 0) + { + skipPadding = true; + JITDUMP(";; Skip alignment: 'Loop already aligned at 16B boundary.'\n"); + } + // Check if the alignment exceeds new maxPadding limit + else if (nPaddingBytes > nMaxPaddingBytes) + { + skipPadding = true; + JITDUMP(";; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, " + "AlignmentBoundary= %dB.'\n", + nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary); + } + } + + // If within maxPaddingLimit + if (!skipPadding) + { + // Padding is needed only if loop starts at or after the current offset. + // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. + size_t extraBytesNotInLoop = + (size_t)(emitComp->opts.compJitAlignLoopBoundary * minBlocksNeededForLoop) - loopSize; + size_t currentOffset = (size_t)offset % alignmentBoundary; + + if (currentOffset > extraBytesNotInLoop) + { + // Padding is needed only if loop starts at or after the current offset and hence might not + // fit in minBlocksNeededForLoop + paddingToAdd = nPaddingBytes; + } + else + { + // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. + JITDUMP(";; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n", + minBlocksNeededForLoop, alignmentBoundary); + } + } + } + else + { + // non-adaptive loop alignment + unsigned extraBytesNotInLoop = (alignmentBoundary * minBlocksNeededForLoop) - loopSize; + unsigned currentOffset = (size_t)offset % alignmentBoundary; + +#ifdef DEBUG + // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary + if (emitComp->opts.compJitAlignLoopForJcc) + { + // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing? + currentOffset++; + } +#endif + + if (currentOffset > extraBytesNotInLoop) + { + // Padding is needed only if loop starts at or after the current offset and hence might not + // fit in minBlocksNeededForLoop + paddingToAdd = (-(int)(size_t)offset) & (alignmentBoundary - 1); + } + else + { + // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment. 
+ JITDUMP(";; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n", minBlocksNeededForLoop, + alignmentBoundary); + } + } + + JITDUMP(";; Calculated padding to add %d bytes to align at %dB boundary that starts at 0x%x.'\n", paddingToAdd, + alignmentBoundary, offset); + + // Either no padding is added because it is too expensive or the offset gets aligned + // to the alignment boundary + assert(paddingToAdd == 0 || (((offset + paddingToAdd) & (alignmentBoundary - 1)) == 0)); + + return paddingToAdd; +} + +#endif + void emitter::emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG) { #ifdef DEBUG @@ -4841,6 +5394,13 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, (void**)&codeBlock, (void**)&coldCodeBlock, (void**)&consBlock); #endif +#ifdef DEBUG + if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0) + { + assert(((size_t)codeBlock & 31) == 0); + } +#endif + // if (emitConsDsc.dsdOffs) // printf("Cons=%08X\n", consBlock); @@ -5374,14 +5934,10 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, { printf("\n"); } - - if (emitComp->verbose) - { - printf("Allocated method code size = %4u , actual size = %4u\n", emitTotalCodeSize, cp - codeBlock); - } #endif unsigned actualCodeSize = emitCurCodeOffs(cp); + assert(emitTotalCodeSize >= actualCodeSize); #if EMITTER_STATS totAllocdSize += emitTotalCodeSize; @@ -5391,7 +5947,11 @@ unsigned emitter::emitEndCodeGen(Compiler* comp, // Fill in eventual unused space, but do not report this space as used. // If you add this padding during the emitIGlist loop, then it will // emit offsets after the loop with wrong value (for example for GC ref variables). - unsigned unusedSize = emitTotalCodeSize - emitCurCodeOffs(cp); + unsigned unusedSize = emitTotalCodeSize - actualCodeSize; + + JITDUMP("Allocated method code size = %4u , actual size = %4u, unused size = %4u\n", emitTotalCodeSize, + actualCodeSize, unusedSize); + for (unsigned i = 0; i < unusedSize; ++i) { *cp++ = DEFAULT_CODE_BUFFER_INIT; @@ -7215,6 +7775,10 @@ void emitter::emitInitIG(insGroup* ig) ig->igSize = 0; ig->igGCregs = RBM_NONE; ig->igInsCnt = 0; + +#if FEATURE_LOOP_ALIGN + ig->igLoopBackEdge = nullptr; +#endif } /***************************************************************************** diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h index 0942a2df4ad93d..8030cc4b0fb16d 100644 --- a/src/coreclr/jit/emit.h +++ b/src/coreclr/jit/emit.h @@ -251,6 +251,10 @@ struct insGroup unsigned short igFlags; // see IGF_xxx below unsigned short igSize; // # of bytes of code in this group +#if FEATURE_LOOP_ALIGN + insGroup* igLoopBackEdge; // "last" back-edge that branches back to an aligned loop head. +#endif + #define IGF_GC_VARS 0x0001 // new set of live GC ref variables #define IGF_BYREF_REGS 0x0002 // new set of live by-ref registers #if defined(FEATURE_EH_FUNCLETS) && defined(TARGET_ARM) @@ -264,6 +268,8 @@ struct insGroup #define IGF_PLACEHOLDER 0x0100 // this is a placeholder group, to be filled in later #define IGF_EXTEND 0x0200 // this block is conceptually an extension of the previous block // and the emitter should continue to track GC info as if there was no new block. +#define IGF_LOOP_ALIGN 0x0400 // this group contains alignment instruction(s) at the end; the next IG is the + // head of a loop that needs alignment. // Mask of IGF_* flags that should be propagated to new blocks when they are created. 
// This allows prologs and epilogs to be any number of IGs, but still be @@ -336,6 +342,11 @@ struct insGroup return *(unsigned*)ptr; } + bool isLoopAlign() + { + return (igFlags & IGF_LOOP_ALIGN) != 0; + } + }; // end of struct insGroup // For AMD64 the maximum prolog/epilog size supported on the OS is 256 bytes @@ -561,6 +572,7 @@ class emitter #if defined(TARGET_XARCH) static_assert_no_msg(INS_count <= 1024); instruction _idIns : 10; +#define MAX_ENCODED_SIZE 15 #elif defined(TARGET_ARM64) static_assert_no_msg(INS_count <= 512); instruction _idIns : 9; @@ -1361,6 +1373,14 @@ class emitter // hot to cold and cold to hot jumps) }; +#if FEATURE_LOOP_ALIGN + struct instrDescAlign : instrDesc + { + instrDescAlign* idaNext; // next align in the group/method + insGroup* idaIG; // containing group + }; +#endif + #if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT. struct instrDescLbl : instrDescJmp { @@ -1738,6 +1758,21 @@ class emitter instrDescJmp* emitJumpLast; // last of local jumps in method void emitJumpDistBind(); // Bind all the local jumps in method +#if FEATURE_LOOP_ALIGN + instrDescAlign* emitCurIGAlignList; // list of align instructions in current IG + unsigned emitLastInnerLoopStartIgNum; // Start IG of last inner loop + unsigned emitLastInnerLoopEndIgNum; // End IG of last inner loop + unsigned emitLastAlignedIgNum; // last IG that has align instruction + instrDescAlign* emitAlignList; // list of local align instructions in method + instrDescAlign* emitAlignLast; // last align instruction in method + unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize); // Get the smallest loop size + void emitLoopAlignment(); + bool emitEndsWithAlignInstr(); // Checks if the current IG ends with an align instruction + void emitSetLoopBackEdge(BasicBlock* loopTopBlock); + void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments + unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool displayAlignmentDetails)); +#endif + void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets bool emitFwdJumps; // forward jumps present? 
@@ -1978,6 +2013,17 @@ class emitter return (instrDescCGCA*)emitAllocAnyInstr(sizeof(instrDescCGCA), attr); } +#if FEATURE_LOOP_ALIGN + instrDescAlign* emitAllocInstrAlign() + { +#if EMITTER_STATS + emitTotalDescAlignCnt++; +#endif // EMITTER_STATS + return (instrDescAlign*)emitAllocAnyInstr(sizeof(instrDescAlign), EA_1BYTE); + } + instrDescAlign* emitNewInstrAlign(); +#endif + instrDesc* emitNewInstrSmall(emitAttr attr); instrDesc* emitNewInstr(emitAttr attr = EA_4BYTE); instrDesc* emitNewInstrSC(emitAttr attr, cnsval_ssize_t cns); @@ -2299,6 +2345,7 @@ class emitter #define SMALL_CNS_TSZ 256 static unsigned emitSmallCns[SMALL_CNS_TSZ]; static unsigned emitLargeCnsCnt; + static unsigned emitTotalDescAlignCnt; static unsigned emitIFcounts[IF_COUNT]; @@ -2501,6 +2548,15 @@ inline emitter::instrDescJmp* emitter::emitNewInstrJmp() return emitAllocInstrJmp(); } +#if FEATURE_LOOP_ALIGN +inline emitter::instrDescAlign* emitter::emitNewInstrAlign() +{ + instrDescAlign* newInstr = emitAllocInstrAlign(); + newInstr->idIns(INS_align); + return newInstr; +} +#endif + #if !defined(TARGET_ARM64) inline emitter::instrDescLbl* emitter::emitNewInstrLbl() { diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp index e91f0cf6d55c95..b6ca4dd7030a3e 100644 --- a/src/coreclr/jit/emitxarch.cpp +++ b/src/coreclr/jit/emitxarch.cpp @@ -874,9 +874,16 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c // * W must be unset (0x00 validates bit 7) if ((vexPrefix & 0xFFFF7F80) == 0x00C46100) { - emitOutputByte(dst, 0xC5); - emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F)); - return 2; + // This encoding optimization is not accounted for while estimating the instruction + // size, and thus over-predicts the instruction size by 1 byte. + // If there are IGs that will be aligned, do not optimize the encoding, so that the + // estimated alignment sizes stay accurate. + if (emitCurIG->igNum > emitLastAlignedIgNum) + { + emitOutputByte(dst, 0xC5); + emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F)); + return 2; + } } emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF)); @@ -2651,22 +2658,62 @@ emitter::instrDesc* emitter::emitNewInstrAmdCns(emitAttr size, ssize_t dsp, int } } -/***************************************************************************** - * - * The next instruction will be a loop head entry point - * So insert a dummy instruction here to ensure that - * the x86 I-cache alignment rule is followed. - */ - -void emitter::emitLoopAlign() +//----------------------------------------------------------------------------- +// +// The next instruction will be a loop head entry point +// So insert an alignment instruction here to ensure that +// we can properly align the code. 
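+// The align pseudo-instruction only reserves the padding bytes; the actual NOP encodings are chosen later in +// emitOutputAlign, once final offsets are known.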
+// +void emitter::emitLoopAlign(unsigned short paddingBytes) { /* Insert a pseudo-instruction to ensure that we align the next instruction properly */ - instrDesc* id = emitNewInstrSmall(EA_1BYTE); - id->idIns(INS_align); - id->idCodeSize(15); // We may need to skip up to 15 bytes of code - emitCurIGsize += 15; + assert(paddingBytes <= MAX_ENCODED_SIZE); + paddingBytes = min(paddingBytes, MAX_ENCODED_SIZE); // We may need to skip up to 15 bytes of code + instrDescAlign* id = emitNewInstrAlign(); + id->idCodeSize(paddingBytes); + emitCurIGsize += paddingBytes; + + id->idaIG = emitCurIG; + + /* Append this instruction to this IG's alignment list */ + id->idaNext = emitCurIGAlignList; + emitCurIGAlignList = id; +} + +//----------------------------------------------------------------------------- +// +// The next instruction will be a loop head entry point +// So insert alignment instruction(s) here to ensure that +// we can properly align the code. +// +// This emits more than one `INS_align` instruction depending on the +// alignmentBoundary parameter. +// +void emitter::emitLongLoopAlign(unsigned short alignmentBoundary) +{ + unsigned short nPaddingBytes = alignmentBoundary - 1; + unsigned short nAlignInstr = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE; + unsigned short instrDescSize = nAlignInstr * sizeof(instrDescAlign); + unsigned short insAlignCount = nPaddingBytes / MAX_ENCODED_SIZE; + unsigned short lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE; + + // Ensure that all align instructions fall in same IG. + if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp) + { + emitForceNewIG = true; + } + + /* Insert a pseudo-instruction to ensure that we align + the next instruction properly */ + + while (insAlignCount) + { + emitLoopAlign(); + insAlignCount--; + } + emitLoopAlign(lastInsAlignSize); } /***************************************************************************** @@ -2676,7 +2723,7 @@ void emitter::emitLoopAlign() void emitter::emitIns_Nop(unsigned size) { - assert(size <= 15); + assert(size <= MAX_ENCODED_SIZE); instrDesc* id = emitNewInstr(); id->idIns(INS_nop); @@ -7341,6 +7388,12 @@ size_t emitter::emitSizeOfInsDsc(instrDesc* id) switch (idOp) { case ID_OP_NONE: +#if FEATURE_LOOP_ALIGN + if (id->idIns() == INS_align) + { + return sizeof(instrDescAlign); + } +#endif break; case ID_OP_LBL: @@ -9325,6 +9378,49 @@ static BYTE* emitOutputNOP(BYTE* dst, size_t nBytes) return dst; } +//-------------------------------------------------------------------- +// emitOutputAlign: Outputs NOPs to align the loop +// +// Arguments: +// ig - Current instruction group +// id - align instruction that holds amount of padding (NOPs) to add +// dst - Destination buffer +// +// Return Value: +// Updated 'dst' pointer, advanced past the emitted NOPs. +// +// Notes: +// Amount of padding needed to align the loop is already calculated. This +// method extracts that information and inserts suitable NOP instructions. +// +BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst) +{ + // Candidate for loop alignment + assert(codeGen->ShouldAlignLoops()); + assert(ig->isLoopAlign()); + + unsigned paddingToAdd = id->idCodeSize(); + + // Either things are already aligned or align them here. 
+ assert((paddingToAdd == 0) || (((size_t)dst & (emitComp->opts.compJitAlignLoopBoundary - 1)) != 0)); + + // Padding amount should not exceed the alignment boundary + assert(paddingToAdd < emitComp->opts.compJitAlignLoopBoundary); + +#ifdef DEBUG + bool displayAlignmentDetails = (emitComp->opts.disAsm /*&& emitComp->opts.disAddr*/) || emitComp->verbose; + unsigned paddingNeeded = emitCalculatePaddingForLoopAlignment(ig, (size_t)dst, displayAlignmentDetails); + + // For non-adaptive, the padding is spread across multiple instructions, so don't bother checking + if (emitComp->opts.compJitAlignLoopAdaptive) + { + assert(paddingToAdd == paddingNeeded); + } +#endif + + return emitOutputNOP(dst, paddingToAdd); +} + /***************************************************************************** * * Output an instruction involving an address mode. @@ -12398,7 +12494,8 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i) #ifdef DEBUG if (emitComp->verbose) { - printf("; NOTE: size of jump [%08X] mis-predicted\n", emitComp->dspPtr(id)); + printf("; NOTE: size of jump [%08X] mis-predicted by %d bytes\n", emitComp->dspPtr(id), + (id->idCodeSize() - JMP_SIZE_SMALL)); } #endif } @@ -12559,10 +12656,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { assert(emitIssuing); - BYTE* dst = *dp; - size_t sz = sizeof(instrDesc); - instruction ins = id->idIns(); - unsigned char callInstrSize = 0; + BYTE* dst = *dp; + size_t sz = sizeof(instrDesc); + instruction ins = id->idIns(); + unsigned char callInstrSize = 0; + int emitOffsAdjBefore = emitOffsAdj; #ifdef DEBUG bool dspOffs = emitComp->opts.dspGCtbls; @@ -12598,9 +12696,21 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) // the loop alignment pseudo instruction if (ins == INS_align) { - sz = SMALL_IDSC_SIZE; - dst = emitOutputNOP(dst, (-(int)(size_t)dst) & 0x0f); - assert(((size_t)dst & 0x0f) == 0); + sz = sizeof(instrDescAlign); + // The IG can be marked as not needing alignment after the align instruction was emitted. + // In that case, skip outputting the alignment. + if (ig->isLoopAlign()) + { + dst = emitOutputAlign(ig, id, dst); + } +#ifdef DEBUG + else + { + // If the IG is not marked as needing alignment, then the code size + // should be zero, i.e. no padding is needed. + assert(id->idCodeSize() == 0); + } +#endif break; } @@ -13704,7 +13814,49 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp) { emitDispIns(id, false, dspOffs, true, emitCurCodeOffs(*dp), *dp, (dst - *dp)); } +#endif +#if FEATURE_LOOP_ALIGN + // Only compensate over-estimated instructions if emitCurIG is before + // the last IG that needs alignment. 
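+ // Past the last aligned IG the code is allowed to shrink, since no later loop head depends on these offsets.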
+ if (emitCurIG->igNum <= emitLastAlignedIgNum) + { + int diff = id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp)); + assert(diff >= 0); + if (diff != 0) + { + +#ifdef DEBUG + // should never over-estimate align instruction + assert(id->idIns() != INS_align); + JITDUMP("Added over-estimation compensation: %d\n", diff); + + if (emitComp->opts.disAsm) + { + emitDispInsAddr(dst); + printf("\t\t ;; NOP compensation instructions of %d bytes.\n", diff); + } +#endif + + dst = emitOutputNOP(dst, diff); + + // since we compensated the over-estimation, revert the offsAdj that + // might have happened in the jump + if (emitOffsAdjBefore != emitOffsAdj) + { +#ifdef DEBUG + insFormat format = id->idInsFmt(); + assert((format == IF_LABEL) || (format == IF_RWR_LABEL) || (format == IF_SWR_LABEL)); + assert(diff == (emitOffsAdj - emitOffsAdjBefore)); +#endif + emitOffsAdj -= diff; + } + } + assert((id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp))) == 0); + } +#endif + +#ifdef DEBUG if (emitComp->compDebugBreak) { // set JitEmitPrintRefRegs=1 will print out emitThisGCrefRegs and emitThisByrefRegs diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h index fb2aac2d30f0d9..b0a8327acedb69 100644 --- a/src/coreclr/jit/emitxarch.h +++ b/src/coreclr/jit/emitxarch.h @@ -50,6 +50,7 @@ UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code, int val); UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code); UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code, int val); +BYTE* emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst); BYTE* emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); BYTE* emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); BYTE* emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr); @@ -287,7 +288,9 @@ inline emitAttr emitDecodeScale(unsigned ensz) /************************************************************************/ public: -void emitLoopAlign(); +void emitLoopAlign(unsigned short paddingBytes = 15); + +void emitLongLoopAlign(unsigned short alignmentBoundary); void emitIns(instruction ins); diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp index e70f192fd9500a..b0a49030dacd4a 100644 --- a/src/coreclr/jit/flowgraph.cpp +++ b/src/coreclr/jit/flowgraph.cpp @@ -9642,9 +9642,9 @@ BasicBlock* Compiler::fgSplitBlockAtEnd(BasicBlock* curr) newBlock->bbFlags = curr->bbFlags; // Remove flags that the new block can't have. - newBlock->bbFlags &= - ~(BBF_TRY_BEG | BBF_LOOP_HEAD | BBF_LOOP_CALL0 | BBF_LOOP_CALL1 | BBF_HAS_LABEL | BBF_JMP_TARGET | - BBF_FUNCLET_BEG | BBF_LOOP_PREHEADER | BBF_KEEP_BBJ_ALWAYS | BBF_PATCHPOINT | BBF_BACKWARD_JUMP_TARGET); + newBlock->bbFlags &= ~(BBF_TRY_BEG | BBF_LOOP_HEAD | BBF_LOOP_CALL0 | BBF_LOOP_CALL1 | BBF_HAS_LABEL | + BBF_JMP_TARGET | BBF_FUNCLET_BEG | BBF_LOOP_PREHEADER | BBF_KEEP_BBJ_ALWAYS | + BBF_PATCHPOINT | BBF_BACKWARD_JUMP_TARGET | BBF_LOOP_ALIGN); // Remove the GC safe bit on the new block. 
It seems clear that if we split 'curr' at the end, // such that all the code is left in 'curr', and 'newBlock' just gets the control flow, then @@ -10946,6 +10946,18 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext) break; } + // Add the LOOP_ALIGN flag + if (bNext->isLoopAlign()) + { + // Only if the merged block is a jump target or has a label + if (((block->bbFlags & BBF_JMP_TARGET) != 0) || ((block->bbFlags & BBF_HAS_LABEL) != 0)) + { + block->bbFlags |= BBF_LOOP_ALIGN; + JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " during compacting.\n", bNext->bbNum, + block->bbNum); + } + } + // If we're collapsing a block created after the dominators are // computed, copy block number the block and reuse dominator // information from bNext to block. @@ -11536,6 +11548,14 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable) if (block->isLoopHead() && (succBlock->bbNum <= block->bbNum)) { succBlock->bbFlags |= BBF_LOOP_HEAD; + + if (block->isLoopAlign()) + { + succBlock->bbFlags |= BBF_LOOP_ALIGN; + JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " for loop# %d.\n", block->bbNum, + succBlock->bbNum, block->bbNatLoopNum); + } + if (fgDomsComputed && fgReachable(succBlock, block)) { /* Mark all the reachable blocks between 'succBlock' and 'block', excluding 'block' */ diff --git a/src/coreclr/jit/jit.h b/src/coreclr/jit/jit.h index 9fb780dbd40c66..62e7ac8059b16d 100644 --- a/src/coreclr/jit/jit.h +++ b/src/coreclr/jit/jit.h @@ -747,6 +747,12 @@ class Histogram #define CLFLG_STRUCTPROMOTE 0x00000 #endif +#ifdef TARGET_XARCH +#define FEATURE_LOOP_ALIGN 1 +#else +#define FEATURE_LOOP_ALIGN 0 +#endif + #define CLFLG_MAXOPT \ (CLFLG_CSE | CLFLG_REGVAR | CLFLG_RNGCHKOPT | CLFLG_DEADASGN | CLFLG_CODEMOTION | CLFLG_QMARK | CLFLG_TREETRANS | \ CLFLG_INLINING | CLFLG_STRUCTPROMOTE | CLFLG_CONSTANTFOLD) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index 865ae3033f09aa..5ffab7c0f29e96 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -41,6 +41,27 @@ CONFIG_INTEGER(JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0) // In deb // optimizations are performed on the fast path. CONFIG_INTEGER(JitDefaultFill, W("JitDefaultFill"), 0xdd) // In debug builds, initialize the memory allocated by the nra // with this byte. +CONFIG_INTEGER(JitAlignLoopMinBlockWeight, + W("JitAlignLoopMinBlockWeight"), + DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT) // Minimum weight needed for the first block of a loop to make it a + // candidate for alignment. +CONFIG_INTEGER(JitAlignLoopMaxCodeSize, + W("JitAlignLoopMaxCodeSize"), + DEFAULT_MAX_LOOPSIZE_FOR_ALIGN) // For non-adaptive alignment, maximum loop size (in bytes) for which + // alignment will be done. + // Defaults to 3 blocks of 32-byte chunks = 96 bytes. +CONFIG_INTEGER(JitAlignLoopBoundary, + W("JitAlignLoopBoundary"), + DEFAULT_ALIGN_LOOP_BOUNDARY) // For non-adaptive alignment, address boundary (power of 2) at which loop + // alignment should be done. By default, 32B. +CONFIG_INTEGER(JitAlignLoopForJcc, + W("JitAlignLoopForJcc"), + 0) // If set, for non-adaptive alignment, ensure loop jmps do not sit on or cross the alignment boundary. + +CONFIG_INTEGER(JitAlignLoopAdaptive, + W("JitAlignLoopAdaptive"), + 1) // If set, perform adaptive loop alignment that limits the amount of padding based on loop size. 
+
 CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0)
 CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1)
 CONFIG_INTEGER(JitDumpASCII, W("JitDumpASCII"), 1) // Uses only ASCII characters in tree dumps
@@ -202,6 +223,12 @@ CONFIG_INTEGER(EnableIncompleteISAClass, W("EnableIncompleteISAClass"), 0) // En
                                                                            // intrinsic classes
 #endif // defined(DEBUG)

+#if FEATURE_LOOP_ALIGN
+CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 1) // If set, align inner loops
+#else
+CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 0)
+#endif
+
 ///
 /// JIT
 ///
diff --git a/src/coreclr/jit/jitee.h b/src/coreclr/jit/jitee.h
index 298536138b2e1a..6301166e489c0f 100644
--- a/src/coreclr/jit/jitee.h
+++ b/src/coreclr/jit/jitee.h
@@ -63,45 +63,45 @@ class JitFlags
        JIT_FLAG_BBINSTR = 29,              // Collect basic block profile information
        JIT_FLAG_BBOPT = 30,                // Optimize method based on profile information
        JIT_FLAG_FRAMED = 31,               // All methods have an EBP frame
-       JIT_FLAG_ALIGN_LOOPS = 32,          // add NOPs before loops to align them at 16 byte boundaries
+       JIT_FLAG_UNUSED12 = 32,
        JIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0. (used by IL stubs)
-       JIT_FLAG_UNUSED12 = 34,
+       JIT_FLAG_UNUSED13 = 34,
        JIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
        JIT_FLAG_USE_PINVOKE_HELPERS = 36,     // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions
        JIT_FLAG_REVERSE_PINVOKE = 37,         // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog
-       JIT_FLAG_UNUSED13 = 38,
+       JIT_FLAG_UNUSED14 = 38,
        JIT_FLAG_TIER0 = 39, // This is the initial tier for tiered compilation which should generate code as quickly as possible
        JIT_FLAG_TIER1 = 40, // This is the final tier (for now) for tiered compilation which should generate high quality code

#if defined(TARGET_ARM)
        JIT_FLAG_RELATIVE_CODE_RELOCS = 41, // JIT should generate PC-relative address computations instead of EE relocation records
#else // !defined(TARGET_ARM)
-       JIT_FLAG_UNUSED14 = 41,
+       JIT_FLAG_UNUSED15 = 41,
#endif // !defined(TARGET_ARM)

        JIT_FLAG_NO_INLINING = 42, // JIT should not inline any called method into this method
-       JIT_FLAG_UNUSED15 = 43,
-       JIT_FLAG_UNUSED16 = 44,
-       JIT_FLAG_UNUSED17 = 45,
-       JIT_FLAG_UNUSED18 = 46,
-       JIT_FLAG_UNUSED19 = 47,
-       JIT_FLAG_UNUSED20 = 48,
-       JIT_FLAG_UNUSED21 = 49,
-       JIT_FLAG_UNUSED22 = 50,
-       JIT_FLAG_UNUSED23 = 51,
-       JIT_FLAG_UNUSED24 = 52,
-       JIT_FLAG_UNUSED25 = 53,
-       JIT_FLAG_UNUSED26 = 54,
-       JIT_FLAG_UNUSED27 = 55,
-       JIT_FLAG_UNUSED28 = 56,
-       JIT_FLAG_UNUSED29 = 57,
-       JIT_FLAG_UNUSED30 = 58,
-       JIT_FLAG_UNUSED31 = 59,
-       JIT_FLAG_UNUSED32 = 60,
-       JIT_FLAG_UNUSED33 = 61,
-       JIT_FLAG_UNUSED34 = 62,
-       JIT_FLAG_UNUSED35 = 63
+       JIT_FLAG_UNUSED16 = 43,
+       JIT_FLAG_UNUSED17 = 44,
+       JIT_FLAG_UNUSED18 = 45,
+       JIT_FLAG_UNUSED19 = 46,
+       JIT_FLAG_UNUSED20 = 47,
+       JIT_FLAG_UNUSED21 = 48,
+       JIT_FLAG_UNUSED22 = 49,
+       JIT_FLAG_UNUSED23 = 50,
+       JIT_FLAG_UNUSED24 = 51,
+       JIT_FLAG_UNUSED25 = 52,
+       JIT_FLAG_UNUSED26 = 53,
+       JIT_FLAG_UNUSED27 = 54,
+       JIT_FLAG_UNUSED28 = 55,
+       JIT_FLAG_UNUSED29 = 56,
+       JIT_FLAG_UNUSED30 = 57,
+       JIT_FLAG_UNUSED31 = 58,
+       JIT_FLAG_UNUSED32 = 59,
+       JIT_FLAG_UNUSED33 = 60,
+       JIT_FLAG_UNUSED34 = 61,
+       JIT_FLAG_UNUSED35 = 62,
+       JIT_FLAG_UNUSED36 = 63
    };

    // clang-format on
@@ -201,7 +201,6 @@ class JitFlags
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR, JIT_FLAG_BBINSTR);
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_BBOPT, JIT_FLAG_BBOPT);
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_FRAMED, JIT_FLAG_FRAMED);
-       FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_ALIGN_LOOPS, JIT_FLAG_ALIGN_LOOPS);
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_PUBLISH_SECRET_PARAM, JIT_FLAG_PUBLISH_SECRET_PARAM);
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_SAMPLING_JIT_BACKGROUND, JIT_FLAG_SAMPLING_JIT_BACKGROUND);
        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_USE_PINVOKE_HELPERS, JIT_FLAG_USE_PINVOKE_HELPERS);
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index b72d32e5ce2eab..e286bc26d3fba7 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -16317,6 +16317,12 @@ bool Compiler::fgFoldConditional(BasicBlock* block)
                  * Remove the loop from the table */

                 optLoopTable[loopNum].lpFlags |= LPFLG_REMOVED;
+
+#if FEATURE_LOOP_ALIGN
+                optLoopTable[loopNum].lpFirst->bbFlags &= ~BBF_LOOP_ALIGN;
+                JITDUMP("Removing LOOP_ALIGN flag from bogus loop in " FMT_BB "\n",
+                        optLoopTable[loopNum].lpFirst->bbNum);
+#endif
+
 #ifdef DEBUG
                 if (verbose)
                 {
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index e134915cfe9d39..ddadd938fcfc68 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2578,6 +2578,41 @@ void Compiler::optFindNaturalLoops()
 #endif // DEBUG
 }

+//-----------------------------------------------------------------------------
+//
+// Inner loops whose first block's weight meets a threshold are marked
+// as needing alignment.
+//
+
+void Compiler::optIdentifyLoopsForAlignment()
+{
+#if FEATURE_LOOP_ALIGN
+    if (codeGen->ShouldAlignLoops())
+    {
+        for (unsigned char loopInd = 0; loopInd < optLoopCount; loopInd++)
+        {
+            BasicBlock* first = optLoopTable[loopInd].lpFirst;
+
+            // An inner loop candidate that might need alignment
+            if (optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP)
+            {
+                if (first->getBBWeight(this) >= (opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT))
+                {
+                    first->bbFlags |= BBF_LOOP_ALIGN;
+                    JITDUMP("L%02u that starts at " FMT_BB " needs alignment, weight=%f.\n", loopInd, first->bbNum,
+                            first->getBBWeight(this));
+                }
+                else
+                {
+                    JITDUMP("Skip alignment for L%02u that starts at " FMT_BB ", weight=%f.\n", loopInd, first->bbNum,
+                            first->getBBWeight(this));
+                }
+            }
+        }
+    }
+#endif
+}
+
 void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap)
 {
     BasicBlock* newJumpDest = nullptr;
@@ -3757,6 +3792,22 @@ void Compiler::optUnrollLoops()
 #endif
         }

+#if FEATURE_LOOP_ALIGN
+        for (block = head->bbNext;; block = block->bbNext)
+        {
+            if (block->isLoopAlign())
+            {
+                block->bbFlags &= ~BBF_LOOP_ALIGN;
+                JITDUMP("Removing LOOP_ALIGN flag from unrolled loop in " FMT_BB "\n", block->bbNum);
+            }
+
+            if (block == bottom)
+            {
+                break;
+            }
+        }
+#endif
+
         /* Create the unrolled loop statement list */
         {
             BlockToBlockMap blockMap(getAllocator());
@@ -4506,6 +4557,10 @@ void Compiler::optOptimizeLoops()
         }
     }

+    // Check if any of the loops need alignment
+
+    optIdentifyLoopsForAlignment();
+
 #if COUNT_LOOPS
     totalUnnatLoopCount += loopNum;
 #endif
@@ -5146,9 +5201,10 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
 {
     assert(loopInd < optLoopCount);

-    JITDUMP("\nCloning loop %d: [h: %d, f: %d, t: %d, e: %d, b: %d].\n", loopInd, optLoopTable[loopInd].lpHead->bbNum,
-            optLoopTable[loopInd].lpFirst->bbNum, optLoopTable[loopInd].lpTop->bbNum,
-            optLoopTable[loopInd].lpEntry->bbNum, optLoopTable[loopInd].lpBottom->bbNum);
+    JITDUMP("\nCloning loop %d: [h: %d, f: %d, t: %d, e: %d, b: %d, c: %d].\n", loopInd,
+            optLoopTable[loopInd].lpHead->bbNum, optLoopTable[loopInd].lpFirst->bbNum,
+            optLoopTable[loopInd].lpTop->bbNum, optLoopTable[loopInd].lpEntry->bbNum,
+            optLoopTable[loopInd].lpBottom->bbNum, optLoopTable[loopInd].lpChild);
     // Determine the depth of the loop, so we can properly weight blocks added (outside the cloned loop blocks).
     unsigned depth = optLoopDepth(loopInd);
@@ -7975,6 +8031,20 @@ bool Compiler::optComputeLoopSideEffectsOfBlock(BasicBlock* blk)
 // Marks the containsCall information to "lnum" and any parent loops.
 void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
 {
+
+#if FEATURE_LOOP_ALIGN
+    // If this is the innermost loop, clear the BBF_LOOP_ALIGN flag,
+    // because a loop that contains a call is unlikely to benefit from
+    // alignment.
+    if (optLoopTable[lnum].lpChild == BasicBlock::NOT_IN_LOOP)
+    {
+        BasicBlock* first = optLoopTable[lnum].lpFirst;
+        first->bbFlags &= ~BBF_LOOP_ALIGN;
+        JITDUMP("Removing LOOP_ALIGN flag for L%02u that starts at " FMT_BB " because loop has a call.\n", lnum,
+                first->bbNum);
+    }
+#endif
+
     assert(0 <= lnum && lnum < optLoopCount);
     while (lnum != BasicBlock::NOT_IN_LOOP)
     {
diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
index 79768f5fbdb9eb..1aadd4e2664542 100644
--- a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
+++ b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
@@ -1307,13 +1307,13 @@ public enum CorJitFlag : uint
        CORJIT_FLAG_BBINSTR = 29,              // Collect basic block profile information
        CORJIT_FLAG_BBOPT = 30,                // Optimize method based on profile information
        CORJIT_FLAG_FRAMED = 31,               // All methods have an EBP frame
-       CORJIT_FLAG_ALIGN_LOOPS = 32,          // add NOPs before loops to align them at 16 byte boundaries
+       CORJIT_FLAG_UNUSED8 = 32,
        CORJIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0. (used by IL stubs)
-       CORJIT_FLAG_UNUSED8 = 34,
+       CORJIT_FLAG_UNUSED9 = 34,
        CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
        CORJIT_FLAG_USE_PINVOKE_HELPERS = 36,     // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions
        CORJIT_FLAG_REVERSE_PINVOKE = 37,         // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog
-       CORJIT_FLAG_UNUSED9 = 38,
+       CORJIT_FLAG_UNUSED10 = 38,
        CORJIT_FLAG_TIER0 = 39,                // This is the initial tier for tiered compilation which should generate code as quickly as possible
        CORJIT_FLAG_TIER1 = 40,                // This is the final tier (for now) for tiered compilation which should generate high quality code
        CORJIT_FLAG_RELATIVE_CODE_RELOCS = 41, // JIT should generate PC-relative address computations instead of EE relocation records
diff --git a/src/coreclr/vm/eeconfig.cpp b/src/coreclr/vm/eeconfig.cpp
index 389e4024e8c3bf..c1336060d21b71 100644
--- a/src/coreclr/vm/eeconfig.cpp
+++ b/src/coreclr/vm/eeconfig.cpp
@@ -118,7 +118,6 @@ HRESULT EEConfig::Init()
     iJitOptimizeType = OPT_DEFAULT;
     fJitFramed = false;
-    fJitAlignLoops = false;
     fJitMinOpts = false;

     fPInvokeRestoreEsp = (DWORD)-1;
@@ -689,7 +688,6 @@ fTrackDynamicMethodDebugInfo = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_
     dwJitHostMaxSlabCache = CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_JitHostMaxSlabCache);

     fJitFramed = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JitFramed, fJitFramed) != 0);
-    fJitAlignLoops = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JitAlignLoops, fJitAlignLoops) != 0);
     fJitMinOpts = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JITMinOpts, fJitMinOpts) == 1);
     iJitOptimizeType = GetConfigDWORD_DontUse_(CLRConfig::EXTERNAL_JitOptimizeType, iJitOptimizeType);
     if (iJitOptimizeType > OPT_RANDOM) iJitOptimizeType = OPT_DEFAULT;
diff --git a/src/coreclr/vm/eeconfig.h b/src/coreclr/vm/eeconfig.h
index 46616fa1f5d002..a068e447117e18 100644
--- a/src/coreclr/vm/eeconfig.h
+++ b/src/coreclr/vm/eeconfig.h
@@ -75,7 +75,6 @@ class EEConfig
     bool GetTrackDynamicMethodDebugInfo(void) const {LIMITED_METHOD_CONTRACT; return fTrackDynamicMethodDebugInfo; }
     unsigned int GenOptimizeType(void) const {LIMITED_METHOD_CONTRACT; return iJitOptimizeType; }
     bool JitFramed(void) const {LIMITED_METHOD_CONTRACT; return fJitFramed; }
-    bool JitAlignLoops(void) const {LIMITED_METHOD_CONTRACT; return fJitAlignLoops; }
     bool JitMinOpts(void) const {LIMITED_METHOD_CONTRACT; return fJitMinOpts; }

     // Tiered Compilation config
@@ -537,7 +536,6 @@ class EEConfig
     DWORD dwJitHostMaxSlabCache;       // max size for jit host slab cache
     bool fTrackDynamicMethodDebugInfo; // Enable/Disable tracking dynamic method debug info
     bool fJitFramed;                   // Enable/Disable EBP based frames
-    bool fJitAlignLoops;               // Enable/Disable loop alignment
     bool fJitMinOpts;                  // Enable MinOpts for all jitted methods

     unsigned iJitOptimizeType; // 0=Blended,1=SmallCode,2=FastCode, default is 0=Blended
diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp
index 0d60059283a3d0..aa60a55ceb3e27 100644
--- a/src/coreclr/vm/jitinterface.cpp
+++ b/src/coreclr/vm/jitinterface.cpp
@@ -12676,8 +12676,6 @@ CorJitResult CallCompileMethodWithSEHWrapper(EEJitManager *jitMgr,
     CORJIT_FLAGS flags;
     if (g_pConfig->JitFramed())
         flags.Set(CORJIT_FLAGS::CORJIT_FLAG_FRAMED);
-    if (g_pConfig->JitAlignLoops())
-        flags.Set(CORJIT_FLAGS::CORJIT_FLAG_ALIGN_LOOPS);
 #ifdef TARGET_X86
     if (g_pConfig->PInvokeRestoreEsp(ftn->GetModule()->IsPreV4Assembly()))
         flags.Set(CORJIT_FLAGS::CORJIT_FLAG_PINVOKE_RESTORE_ESP);
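With the EE-side JitAlignLoops plumbing and CORJIT_FLAG_ALIGN_LOOPS removed above, the alignment decision lives entirely in the JIT, which can also ask whether padding actually buys anything before emitting it. Below is a minimal sketch of that kind of final check, assuming a 32-byte fetch window; BoundariesCrossed and PaddingHelps are invented names for illustration, not the emitter's API.

#include <cstdio>

// Number of 'boundary'-byte lines the byte range [start, start + size) crosses.
// Assumes size >= 1 (a loop body is never empty).
static unsigned BoundariesCrossed(unsigned start, unsigned size, unsigned boundary)
{
    unsigned firstLine = start / boundary;
    unsigned lastLine  = (start + size - 1) / boundary;
    return lastLine - firstLine;
}

// Pad only when doing so removes at least one boundary crossing; otherwise
// the NOPs cost code size and decode bandwidth for nothing.
static bool PaddingHelps(unsigned offset, unsigned loopSize, unsigned padding, unsigned boundary)
{
    unsigned before = BoundariesCrossed(offset, loopSize, boundary);
    unsigned after  = BoundariesCrossed(offset + padding, loopSize, boundary);
    return after < before;
}

int main()
{
    // A 30-byte loop at offset 0x1C straddles two 32-byte windows; 4 bytes of
    // padding moves it into a single window, so this prints "pad".
    printf("%s\n", PaddingHelps(0x1C, 30, 4, 32) ? "pad" : "skip");
    return 0;
}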