From 32236ac48254f69cca61446875d4844ba1d1f785 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 8 Oct 2020 15:37:43 -0700
Subject: [PATCH 01/59] Detect inner loop and add 10 bytes of padding at the
 beginning

---
 src/coreclr/jit/block.cpp         |  4 ++++
 src/coreclr/jit/block.h           |  1 +
 src/coreclr/jit/codegenlinear.cpp |  5 +++++
 src/coreclr/jit/emitarm.cpp       | 17 +++++++++++++++++
 src/coreclr/jit/emitarm.h         |  2 ++
 src/coreclr/jit/emitarm64.cpp     | 17 +++++++++++++++++
 src/coreclr/jit/emitarm64.h       |  2 ++
 src/coreclr/jit/optimizer.cpp     |  6 +++++-
 8 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp
index f2b14599335f5f..ff3902ed564ef7 100644
--- a/src/coreclr/jit/block.cpp
+++ b/src/coreclr/jit/block.cpp
@@ -505,6 +505,10 @@ void BasicBlock::dspFlags()
     {
         printf("cfe ");
     }
+    if (bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
+    {
+        printf("finnerloop ");
+    }
 }
 
 /*****************************************************************************
diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h
index 02c37361e831ce..148dd24557c731 100644
--- a/src/coreclr/jit/block.h
+++ b/src/coreclr/jit/block.h
@@ -448,6 +448,7 @@ struct BasicBlock : private LIR::Range
 
 #define BBF_PATCHPOINT                     MAKE_BBFLAG(36) // Block is a patchpoint
 #define BBF_HAS_CLASS_PROFILE              MAKE_BBFLAG(37) // BB contains a call needing a class profile
+#define BBF_FIRST_BLOCK_IN_INNERLOOP      0x8000000000 // Block is lexically the fist block within the innermost loop.
 
 // clang-format on
 
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index bf8d1ce087adf9..76af00c9a14b27 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -427,6 +427,11 @@ void CodeGen::genCodeForBBlist()
         }
 #endif // DEBUG
 
+        if (block->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
+        {
+            GetEmitter()->emitIns_Nop(10);
+        }
+
         IL_OFFSETX currentILOffset = BAD_IL_OFFSET;
         for (GenTree* node : LIR::AsRange(block).NonPhiNodes())
         {
diff --git a/src/coreclr/jit/emitarm.cpp b/src/coreclr/jit/emitarm.cpp
index a2001cf0ffcd5a..66aa7c8b201510 100644
--- a/src/coreclr/jit/emitarm.cpp
+++ b/src/coreclr/jit/emitarm.cpp
@@ -1468,6 +1468,23 @@ void emitter::emitIns(instruction ins)
     appendToCurIG(id);
 }
 
+/*****************************************************************************
+ *
+ *  Add a NOP instructions to pad the instruction stream by (size / 4) bytes.
+ */
+
+void emitter::emitIns_Nop(unsigned size)
+{
+    assert(size > 0);
+    // Max out at 28 bytes of nop...
+    // 32 is the largest method entry alignment we support.
+    size = size % 32; // TODO: should be different for arm64?
+    for (unsigned i = 0; i < size / 4; i++)
+    {
+        emitIns(INS_nop);
+    }
+}
+
 /*****************************************************************************
  *
  *  Add an instruction with a single immediate value.
diff --git a/src/coreclr/jit/emitarm.h b/src/coreclr/jit/emitarm.h
index e663a953e7a10b..e718bf50d667ec 100644
--- a/src/coreclr/jit/emitarm.h
+++ b/src/coreclr/jit/emitarm.h
@@ -215,6 +215,8 @@ static bool emitIns_valid_imm_for_vldst_offset(int imm);
 
 void emitIns(instruction ins);
 
+void emitIns_Nop(unsigned size);
+
 void emitIns_I(instruction ins, emitAttr attr, target_ssize_t imm);
 
 void emitIns_R(instruction ins, emitAttr attr, regNumber reg);
diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp
index 2f78d20a713967..cd31cfe353d081 100644
--- a/src/coreclr/jit/emitarm64.cpp
+++ b/src/coreclr/jit/emitarm64.cpp
@@ -3625,6 +3625,23 @@ void emitter::emitIns(instruction ins)
     appendToCurIG(id);
 }
 
+/*****************************************************************************
+ *
+ *  Add a NOP instructions to pad the instruction stream by (size / 4) bytes.
+ */
+
+void emitter::emitIns_Nop(unsigned size)
+{
+    assert(size > 0);
+    // Max out at 28 bytes of nop...
+    // 32 is the largest method entry alignment we support.
+    size = size % 32; //TODO: should be different for arm64?
+    for (unsigned i = 0; i < size / 4; i++)
+    {
+        emitIns(INS_nop);
+    }
+}
+
 /*****************************************************************************
  *
  *  Add an instruction with a single immediate value.
diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h
index 8b8c812aacfc41..da0000b13d2fa6 100644
--- a/src/coreclr/jit/emitarm64.h
+++ b/src/coreclr/jit/emitarm64.h
@@ -723,6 +723,8 @@ inline static ssize_t computeRelPageAddr(size_t dstAddr, size_t srcAddr)
 public:
 void emitIns(instruction ins);
 
+void emitIns_Nop(unsigned size);
+
 void emitIns_I(instruction ins, emitAttr attr, ssize_t imm);
 
 void emitIns_R(instruction ins, emitAttr attr, regNumber reg);
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index e134915cfe9d39..cbe67509424b62 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -1653,7 +1653,7 @@ class LoopSearch
 
         if (top->bbNum > bottom->bbNum) // is this a backward edge? (from BOTTOM to TOP)
         {
-            // Edge from BOTTOM to TOP is not a backward edge
+            // Edge from TOP to BOTTOM is not a backward edge
             return false;
         }
 
@@ -2542,6 +2542,10 @@ void Compiler::optFindNaturalLoops()
             }
             assert(blk->bbNext != nullptr); // We should never reach nullptr.
         }
+        if (optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP)
+        {
+            first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
+        }
     }
 
     // Make sure that loops are canonical: that every loop has a unique "top", by creating an empty "nop"

From a2ec5d13911c6fabf01395b3b1da878cb1431876 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 8 Oct 2020 18:14:37 -0700
Subject: [PATCH 02/59] Generate the padding nops at the end of the previous block

---
 src/coreclr/jit/codegenlinear.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 76af00c9a14b27..727ae84ba5e739 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -427,11 +427,6 @@ void CodeGen::genCodeForBBlist()
         }
 #endif // DEBUG
 
-        if (block->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
-        {
-            GetEmitter()->emitIns_Nop(10);
-        }
-
         IL_OFFSETX currentILOffset = BAD_IL_OFFSET;
         for (GenTree* node : LIR::AsRange(block).NonPhiNodes())
         {
@@ -465,6 +460,11 @@ void CodeGen::genCodeForBBlist()
             }
         } // end for each node in block
 
+        if (block->bbNext != nullptr && block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
+        {
+            GetEmitter()->emitIns_Nop(10);
+        }
+
 #ifdef DEBUG
         // The following set of register spill checks and GC pointer tracking checks used to be
         // performed at statement boundaries. Now, with LIR, there are no statements, so they are

From 89b58124ee50f8991d2a80a975b647240370d362 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 8 Oct 2020 18:14:52 -0700
Subject: [PATCH 03/59] TODO: figure out if anything needs to be done in
 optCanonicalizeLoop

---
 src/coreclr/jit/optimizer.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index cbe67509424b62..22ea20bc115b6e 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2902,8 +2902,15 @@ bool Compiler::optCanonicalizeLoop(unsigned char loopInd)
     {
         optLoopTable[loopInd].lpEntry = newT;
     }
+    //assert((optLoopTable[loopInd].lpFirst->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP) == 0);
     optLoopTable[loopInd].lpTop   = newT;
     optLoopTable[loopInd].lpFirst = newT;
+    // Something to investigate
+    /*if ((optLoopTable[loopInd].lpFirst->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP) != 0)
+    {
+        newT->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
+    }
+    newT->bbFlags |= (optLoopTable[loopInd].lpFirst->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP);*/
 
     newT->bbNatLoopNum = loopInd;
 

From a84b8be672ef2affe52956232800e3a8cac698f8 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 9 Oct 2020 14:16:54 -0700
Subject: [PATCH 04/59] Add COMPlus_JitAlignLoopMinBlockWeight and
 COMPlus_JitAlignLoopMaxCodeSize

- Add 2 variables to control which loops get aligned
- Move the padding to after the conditional/unconditional jump of the previous block
---
 src/coreclr/jit/codegenlinear.cpp | 14 +++++++++-----
 src/coreclr/jit/compiler.cpp      |  3 +++
 src/coreclr/jit/compiler.h        | 11 +++++++++++
 src/coreclr/jit/jitconfigvalues.h |  5 +++++
 src/coreclr/jit/optimizer.cpp     |  4 +++-
 5 files changed, 31 insertions(+), 6 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 727ae84ba5e739..a30d5f98f694a0 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -460,11 +460,6 @@ void CodeGen::genCodeForBBlist()
             }
         } // end for each node in block
 
-        if (block->bbNext != nullptr && block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
-        {
-            GetEmitter()->emitIns_Nop(10);
-        }
-
 #ifdef DEBUG
         // The following set of register spill checks and GC pointer tracking checks used to be
         // performed at statement boundaries. Now, with LIR, there are no statements, so they are
@@ -755,6 +750,15 @@ void CodeGen::genCodeForBBlist()
                 break;
         }
 
+        if ((block != nullptr) && (block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
+        {
+            if (verbose)
+            {
+                printf("To align next block, add padding.\n");
+            }
+            GetEmitter()->emitIns_Nop(10);
+        }
+
 #if defined(DEBUG) && defined(USING_VARIABLE_LIVE_RANGE)
         if (compiler->verbose)
         {
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index b25afff5a6fdaa..6043ad6bb7e7c7 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2615,6 +2615,9 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.compDbgInfo = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_INFO);
     opts.compDbgEnC  = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_EnC);
 
+    compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
+    compJitAlignLoopMaxCodeSize = JitConfig.JitAlignLoopMaxCodeSize();
+
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns
     opts.compDbgCode = false;
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 1cdebfb9c3c8aa..582790cdb11953 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -2234,6 +2234,17 @@ class Compiler
 public:
     hashBvGlobalData hbvGlobalData; // Used by the hashBv bitvector package.
 
+/*
+* Loop alignment heuristics
+* These are overriden by the COMPlus_ variables, but in future, 
+*/
+
+//#define ALIGN_LOOP_MIN_BB_WEIGHT 100 // Minimum average hits a block should get in order to be considered as hot for loop alignment.
+//#define ALIGN_LOOP_MAX_CODE_SIZE 20 // Maximum code size of a loop for which loop alignment will be done.
+
+    unsigned compJitAlignLoopMinBlockWeight;
+    unsigned compJitAlignLoopMaxCodeSize;
+
 #ifdef DEBUG
     bool verbose;
     bool verboseTrees;
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 865ae3033f09aa..e9a3f0500b1633 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -41,6 +41,11 @@ CONFIG_INTEGER(JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0) // In deb
                                                                        // optimizations are performed on the fast path.
 CONFIG_INTEGER(JitDefaultFill, W("JitDefaultFill"), 0xdd) // In debug builds, initialize the memory allocated by the nra
                                                           // with this byte.
+CONFIG_INTEGER(JitAlignLoopMinBlockWeight, W("JitAlignLoopMinBlockWeight"), 0xffffffff) // Minimum weight needed of the first block of a loop to trigger its alignment.
+CONFIG_INTEGER(JitAlignLoopMaxCodeSize,
+               W("JitAlignLoopMaxCodeSize"),
+               0)  // Maximum code size (in bytes) of the loop after which the the code alignment for that loop will be disabled.
+
 CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0)
 CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1)
 CONFIG_INTEGER(JitDumpASCII, W("JitDumpASCII"), 1)         // Uses only ASCII characters in tree dumps
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 22ea20bc115b6e..3c025e85d1220f 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2542,7 +2542,9 @@ void Compiler::optFindNaturalLoops()
             }
             assert(blk->bbNext != nullptr); // We should never reach nullptr.
         }
-        if (optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP)
+
+        // An innerloop candidate that might need alignment
+        if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) && (compJitAlignLoopMinBlockWeight <= first->getBBWeight(this)))
         {
             first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
         }

From a51759af6c6d327309736c029d7da22e8931e23f Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 9 Oct 2020 17:41:42 -0700
Subject: [PATCH 05/59] Reuse AlignLoops flag for dynamic loop alignment

---
 src/coreclr/jit/codegenlinear.cpp | 19 ++++++++-----------
 src/coreclr/jit/emitarm.cpp       |  6 ++----
 src/coreclr/jit/emitarm.h         |  2 +-
 src/coreclr/jit/emitarm64.cpp     |  6 ++----
 src/coreclr/jit/emitarm64.h       |  2 +-
 5 files changed, 14 insertions(+), 21 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index a30d5f98f694a0..a6e8ee109d61a8 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -311,13 +311,6 @@ void CodeGen::genCodeForBBlist()
 
         genUpdateCurrentFunclet(block);
 
-#ifdef TARGET_XARCH
-        if (ShouldAlignLoops() && block->bbFlags & BBF_LOOP_HEAD)
-        {
-            GetEmitter()->emitLoopAlign();
-        }
-#endif
-
         genLogLabel(block);
 
         // Tell everyone which basic block we're working on
@@ -750,13 +743,17 @@ void CodeGen::genCodeForBBlist()
                 break;
         }
 
-        if ((block != nullptr) && (block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
+        if (ShouldAlignLoops())
         {
-            if (verbose)
+            if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
             {
-                printf("To align next block, add padding.\n");
+                if (verbose)
+                {
+                    printf("To align next block, add padding.\n");
+                }
+                //GetEmitter()->emitIns_Nop(10);
+                GetEmitter()->emitLoopAlign();
             }
-            GetEmitter()->emitIns_Nop(10);
         }
 
 #if defined(DEBUG) && defined(USING_VARIABLE_LIVE_RANGE)
diff --git a/src/coreclr/jit/emitarm.cpp b/src/coreclr/jit/emitarm.cpp
index 66aa7c8b201510..48f754b3ab06f5 100644
--- a/src/coreclr/jit/emitarm.cpp
+++ b/src/coreclr/jit/emitarm.cpp
@@ -1473,13 +1473,11 @@ void emitter::emitIns(instruction ins)
  *  Add a NOP instructions to pad the instruction stream by (size / 4) bytes.
  */
 
-void emitter::emitIns_Nop(unsigned size)
+void emitter::emitLoopAlign()
 {
-    assert(size > 0);
     // Max out at 28 bytes of nop...
     // 32 is the largest method entry alignment we support.
-    size = size % 32; // TODO: should be different for arm64?
-    for (unsigned i = 0; i < size / 4; i++)
+    for (unsigned i = 0; i < 4; i++)
     {
         emitIns(INS_nop);
     }
diff --git a/src/coreclr/jit/emitarm.h b/src/coreclr/jit/emitarm.h
index e718bf50d667ec..b1e2512d2b09d1 100644
--- a/src/coreclr/jit/emitarm.h
+++ b/src/coreclr/jit/emitarm.h
@@ -215,7 +215,7 @@ static bool emitIns_valid_imm_for_vldst_offset(int imm);
 
 void emitIns(instruction ins);
 
-void emitIns_Nop(unsigned size);
+void emitLoopAlign();
 
 void emitIns_I(instruction ins, emitAttr attr, target_ssize_t imm);
 
diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp
index cd31cfe353d081..7a2df2a4b0ac21 100644
--- a/src/coreclr/jit/emitarm64.cpp
+++ b/src/coreclr/jit/emitarm64.cpp
@@ -3630,13 +3630,11 @@ void emitter::emitIns(instruction ins)
  *  Add a NOP instructions to pad the instruction stream by (size / 4) bytes.
  */
 
-void emitter::emitIns_Nop(unsigned size)
+void emitter::emitLoopAlign()
 {
-    assert(size > 0);
     // Max out at 28 bytes of nop...
     // 32 is the largest method entry alignment we support.
-    size = size % 32; //TODO: should be different for arm64?
-    for (unsigned i = 0; i < size / 4; i++)
+    for (unsigned i = 0; i < 4; i++)
     {
         emitIns(INS_nop);
     }
diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h
index da0000b13d2fa6..403ee087082b9c 100644
--- a/src/coreclr/jit/emitarm64.h
+++ b/src/coreclr/jit/emitarm64.h
@@ -723,7 +723,7 @@ inline static ssize_t computeRelPageAddr(size_t dstAddr, size_t srcAddr)
 public:
 void emitIns(instruction ins);
 
-void emitIns_Nop(unsigned size);
+void emitLoopAlign();
 
 void emitIns_I(instruction ins, emitAttr attr, ssize_t imm);
 

From 2f57c7885c04f24604427c8a5a7278f2a115fd9a Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 14 Oct 2020 14:23:21 -0700
Subject: [PATCH 06/59] Detect back edge and count no. of instructions before
 doing loop alignment

---
 src/coreclr/jit/codegenlinear.cpp | 41 ++++++++++++++++++++++++-------
 src/coreclr/jit/compiler.cpp      |  2 +-
 src/coreclr/jit/compiler.h        |  2 +-
 src/coreclr/jit/emit.cpp          |  1 +
 src/coreclr/jit/emit.h            |  3 +++
 src/coreclr/jit/emitxarch.cpp     | 30 +++++++++++++++++++---
 src/coreclr/jit/jitconfigvalues.h |  4 +--
 src/coreclr/jit/optimizer.cpp     | 12 ++++++---
 8 files changed, 76 insertions(+), 19 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index a6e8ee109d61a8..c16e3608330ad4 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -734,8 +734,30 @@ void CodeGen::genCodeForBBlist()
 #endif // !FEATURE_EH_FUNCLETS
 
             case BBJ_NONE:
-            case BBJ_COND:
+                break;
+
+            //TODO: Should this be done for BB_ALWAYS as well?
             case BBJ_SWITCH:
+            case BBJ_COND:
+                if (block->bbJumpDest->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
+                {
+                    // Track the destination IG which is the first block of inner loop.
+                    // In emitter, this will be used to calculate total instructions present
+                    // in all IGs that participate in a loop.
+
+                    insGroup* srcIG = GetEmitter()->emitCurIG;
+                    insGroup* dstIG = (insGroup*)block->bbJumpDest->bbEmitCookie;
+
+                    // Only track back edges to the loop.
+                    if (dstIG->igNum <= srcIG->igNum)
+                    {
+                        srcIG->igLoopBackEdge = dstIG;
+                        if (verbose)
+                        {
+                            printf("** IG_%d jumps back to IG_%d forming a loop.\n", srcIG->igNum, dstIG->igNum);
+                        }
+                    }
+                }
                 break;
 
             default:
@@ -743,17 +765,18 @@ void CodeGen::genCodeForBBlist()
                 break;
         }
 
-        if (ShouldAlignLoops())
+        if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
         {
-            if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
+            assert(false);
+            if (verbose)
             {
-                if (verbose)
-                {
-                    printf("To align next block, add padding.\n");
-                }
-                //GetEmitter()->emitIns_Nop(10);
-                GetEmitter()->emitLoopAlign();
+                printf("Adding 'align' instruction to align loop header block " FMT_BB, block->bbNext->bbNum);
             }
+            GetEmitter()->emitLoopAlign();
+
+            // Mark this IG as need alignment so during emitter we can check the instruction count heuristics of
+            // all IGs that follows this IG and participate in a loop.
+            GetEmitter()->emitCurIG->igFlags |= IGF_ALIGN_LOOP;
         }
 
 #if defined(DEBUG) && defined(USING_VARIABLE_LIVE_RANGE)
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 6043ad6bb7e7c7..7529b3341bfcbf 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2616,7 +2616,7 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.compDbgEnC  = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_EnC);
 
     compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
-    compJitAlignLoopMaxCodeSize = JitConfig.JitAlignLoopMaxCodeSize();
+    compJitAlignLoopMaxInstrCount = JitConfig.JitAlignLoopMaxInstrCount();
 
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 582790cdb11953..69c0207301c274 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -2243,7 +2243,7 @@ class Compiler
 //#define ALIGN_LOOP_MAX_CODE_SIZE 20 // Maximum code size of a loop for which loop alignment will be done.
 
     unsigned compJitAlignLoopMinBlockWeight;
-    unsigned compJitAlignLoopMaxCodeSize;
+    unsigned compJitAlignLoopMaxInstrCount;
 
 #ifdef DEBUG
     bool verbose;
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 3056c71e6a0932..6a35812f808927 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -7215,6 +7215,7 @@ void emitter::emitInitIG(insGroup* ig)
     ig->igSize   = 0;
     ig->igGCregs = RBM_NONE;
     ig->igInsCnt = 0;
+    ig->igLoopBackEdge = nullptr;
 }
 
 /*****************************************************************************
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 0942a2df4ad93d..2088ddad1618e3 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -264,6 +264,7 @@ struct insGroup
 #define IGF_PLACEHOLDER 0x0100    // this is a placeholder group, to be filled in later
 #define IGF_EXTEND 0x0200         // this block is conceptually an extension of the previous block
                                   // and the emitter should continue to track GC info as if there was no new block.
+#define IGF_ALIGN_LOOP 0x0400
 
 // Mask of IGF_* flags that should be propagated to new blocks when they are created.
 // This allows prologs and epilogs to be any number of IGs, but still be
@@ -292,6 +293,7 @@ struct insGroup
 #endif
     regMaskSmall  igGCregs; // set of registers with live GC refs
     unsigned char igInsCnt; // # of instructions  in this group
+    insGroup*     igLoopBackEdge;
 
 #else // REGMASK_BITS
 
@@ -307,6 +309,7 @@ struct insGroup
 #endif
 
     unsigned char igInsCnt; // # of instructions  in this group
+    // TODO: Add loopBackEdge?
 
 #endif // REGMASK_BITS
 
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index e91f0cf6d55c95..f373f2850c7a93 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12598,9 +12598,33 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             // the loop alignment pseudo instruction
             if (ins == INS_align)
             {
-                sz  = SMALL_IDSC_SIZE;
-                dst = emitOutputNOP(dst, (-(int)(size_t)dst) & 0x0f);
-                assert(((size_t)dst & 0x0f) == 0);
+                sz = SMALL_IDSC_SIZE;
+
+                // Candidate for loop alignment
+                if (ig->igFlags & IGF_ALIGN_LOOP)
+                {
+                    unsigned totalInstrCount = 0;
+                    bool foundBackEdge   = false;
+                    for (insGroup* igInLoop = ig->igNext; igInLoop; igInLoop = igInLoop->igNext)
+                    {
+                        totalInstrCount += igInLoop->igInsCnt;
+                        if (igInLoop->igLoopBackEdge == ig)
+                        {
+                            foundBackEdge = true;
+                            break;
+                        }
+                    }
+
+                    assert(foundBackEdge);
+
+                    // Only align if it matches the heuristics
+                    if (totalInstrCount <= emitComp->compJitAlignLoopMaxInstrCount)
+                    {
+                        dst = emitOutputNOP(dst, (-(int)(size_t)dst) & 0x0f);
+                        assert(((size_t)dst & 0x0f) == 0);
+                    }
+                }
+
                 break;
             }
 
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index e9a3f0500b1633..8cf36d9eb7c70a 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -42,8 +42,8 @@ CONFIG_INTEGER(JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0) // In deb
 CONFIG_INTEGER(JitDefaultFill, W("JitDefaultFill"), 0xdd) // In debug builds, initialize the memory allocated by the nra
                                                           // with this byte.
 CONFIG_INTEGER(JitAlignLoopMinBlockWeight, W("JitAlignLoopMinBlockWeight"), 0xffffffff) // Minimum weight needed of the first block of a loop to trigger its alignment.
-CONFIG_INTEGER(JitAlignLoopMaxCodeSize,
-               W("JitAlignLoopMaxCodeSize"),
+CONFIG_INTEGER(JitAlignLoopMaxInstrCount,
+               W("JitAlignLoopMaxInstrCount"),
                0)  // Maximum code size (in bytes) of the loop after which the the code alignment for that loop will be disabled.
 
 CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0)
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 3c025e85d1220f..0cc83a76b63411 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2543,10 +2543,16 @@ void Compiler::optFindNaturalLoops()
             assert(blk->bbNext != nullptr); // We should never reach nullptr.
         }
 
-        // An innerloop candidate that might need alignment
-        if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) && (compJitAlignLoopMinBlockWeight <= first->getBBWeight(this)))
+        //TODO: Move should align loops flag to jitconfigvalues.h
+        if (codeGen->ShouldAlignLoops())
         {
-            first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
+            // An innerloop candidate that might need alignment
+            if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) &&
+                (compJitAlignLoopMinBlockWeight <= first->getBBWeight(this)))
+            {
+                assert(false);
+                first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
+            }
         }
     }
 

From b4848f88f79551208df02e604093f1d527f27f32 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 15 Oct 2020 11:21:04 -0700
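Subject: [PATCH 07/59] Fix bugs in loop back-edge tracking

- Record back edges for BBJ_ALWAYS jumps as well as BBJ_COND
- Skip jump targets whose instruction group has not been created yet
- Compare igLoopBackEdge against the loop header IG (the IG that follows
  the one holding the 'align' instruction)
- Remove the leftover assert(false) debugging aids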
---
 src/coreclr/jit/codegenlinear.cpp | 17 +++++++++--------
 src/coreclr/jit/emitxarch.cpp     |  9 +++++----
 src/coreclr/jit/optimizer.cpp     |  1 -
 3 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index c16e3608330ad4..fd081cc0fd1283 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -660,10 +660,6 @@ void CodeGen::genCodeForBBlist()
 
         switch (block->bbJumpKind)
         {
-            case BBJ_ALWAYS:
-                inst_JMP(EJ_jmp, block->bbJumpDest);
-                break;
-
             case BBJ_RETURN:
                 genExitCode(block);
                 break;
@@ -734,10 +730,12 @@ void CodeGen::genCodeForBBlist()
 #endif // !FEATURE_EH_FUNCLETS
 
             case BBJ_NONE:
+            case BBJ_SWITCH:
                 break;
 
-            //TODO: Should this be done for BB_ALWAYS as well?
-            case BBJ_SWITCH:
+            case BBJ_ALWAYS:
+                inst_JMP(EJ_jmp, block->bbJumpDest);
+                __fallthrough;
             case BBJ_COND:
                 if (block->bbJumpDest->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
                 {
@@ -749,7 +747,11 @@ void CodeGen::genCodeForBBlist()
                     insGroup* dstIG = (insGroup*)block->bbJumpDest->bbEmitCookie;
 
                     // Only track back edges to the loop.
-                    if (dstIG->igNum <= srcIG->igNum)
+                    // Here dstIG != nullptr checks if we have already generated dstIG for a block.
+                    // If block->bbJumpDest was a forward block, it might have not been created yet.
+                    // We don't rely on (block->bbJumpDest->bbNum <= block->bbNum) because the basic
+                    // block numbering is not guaranteed to be sequential.
+                    if (dstIG != nullptr && dstIG->igNum <= srcIG->igNum)
                     {
                         srcIG->igLoopBackEdge = dstIG;
                         if (verbose)
@@ -767,7 +769,6 @@ void CodeGen::genCodeForBBlist()
 
         if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
         {
-            assert(false);
             if (verbose)
             {
                 printf("Adding 'align' instruction to align loop header block " FMT_BB, block->bbNext->bbNum);
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index f373f2850c7a93..119bf21568aee6 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12603,12 +12603,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 // Candidate for loop alignment
                 if (ig->igFlags & IGF_ALIGN_LOOP)
                 {
-                    unsigned totalInstrCount = 0;
-                    bool foundBackEdge   = false;
-                    for (insGroup* igInLoop = ig->igNext; igInLoop; igInLoop = igInLoop->igNext)
+                    unsigned  totalInstrCount = 0;
+                    bool      foundBackEdge   = false;
+                    insGroup* loopHeaderIg    = ig->igNext;
+                    for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
                     {
                         totalInstrCount += igInLoop->igInsCnt;
-                        if (igInLoop->igLoopBackEdge == ig)
+                        if (igInLoop->igLoopBackEdge == loopHeaderIg)
                         {
                             foundBackEdge = true;
                             break;
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 0cc83a76b63411..599c6d2b01f5be 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2550,7 +2550,6 @@ void Compiler::optFindNaturalLoops()
             if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) &&
                 (compJitAlignLoopMinBlockWeight <= first->getBBWeight(this)))
             {
-                assert(false);
                 first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
             }
         }

From fe92c6216402cd3c85b72b714d913aeb159236f0 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 15 Oct 2020 15:38:18 -0700
Subject: [PATCH 08/59] propagate the basic block flag

---
 src/coreclr/jit/flowgraph.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index e70f192fd9500a..202438f75d80e7 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -11048,6 +11048,11 @@ void Compiler::fgUpdateLoopsAfterCompacting(BasicBlock* block, BasicBlock* bNext
             optLoopTable[loopNum].lpEntry = block;
         }
     }
+
+    if (bNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
+    {
+        block->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
+    }
 }
 
 /*****************************************************************************************************
@@ -11547,6 +11552,11 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable)
             skipUnmarkLoop = true;
         }
 
+        if (block->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
+        {
+            succBlock->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
+        }
+
         noway_assert(succBlock);
 
         // If this is the first Cold basic block update fgFirstColdBlock

From 51c317108fcc20afb6d6dbf7254dfe80a198c502 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 16 Oct 2020 14:54:16 -0700
Subject: [PATCH 09/59] Switch from instrCount to codeSize

---
 src/coreclr/jit/compiler.cpp      |  2 +-
 src/coreclr/jit/compiler.h        |  2 +-
 src/coreclr/jit/emit.cpp          |  6 ++++--
 src/coreclr/jit/emitxarch.cpp     | 14 ++++++--------
 src/coreclr/jit/jitconfigvalues.h |  4 ++--
 5 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 7529b3341bfcbf..4ff6b3e0b720c0 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2616,7 +2616,7 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.compDbgEnC  = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_EnC);
 
     compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
-    compJitAlignLoopMaxInstrCount = JitConfig.JitAlignLoopMaxInstrCount();
+    compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
 
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 69c0207301c274..582790cdb11953 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -2243,7 +2243,7 @@ class Compiler
 //#define ALIGN_LOOP_MAX_CODE_SIZE 20 // Maximum code size of a loop for which loop alignment will be done.
 
     unsigned compJitAlignLoopMinBlockWeight;
-    unsigned compJitAlignLoopMaxInstrCount;
+    unsigned compJitAlignLoopMaxCodeSize;
 
 #ifdef DEBUG
     bool verbose;
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 6a35812f808927..6786a07d11d6d4 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -5061,7 +5061,8 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
 #define DEFAULT_CODE_BUFFER_INIT 0xcc
 
 #ifdef DEBUG
-    *instrCount = 0;
+    *instrCount      = 0;
+    bool isColdBlock = false;
 #endif
     for (insGroup* ig = emitIGlist; ig != nullptr; ig = ig->igNext)
     {
@@ -5073,7 +5074,8 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
             assert(emitCurCodeOffs(cp) == emitTotalHotCodeSize);
 
             assert(coldCodeBlock);
-            cp = coldCodeBlock;
+            cp          = coldCodeBlock;
+            isColdBlock = true;
 #ifdef DEBUG
             if (emitComp->opts.disAsm || emitComp->verbose)
             {
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 119bf21568aee6..d502687266d656 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12603,24 +12603,22 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 // Candidate for loop alignment
                 if (ig->igFlags & IGF_ALIGN_LOOP)
                 {
-                    unsigned  totalInstrCount = 0;
-                    bool      foundBackEdge   = false;
+                    unsigned  totalCodeSize = 0;
                     insGroup* loopHeaderIg    = ig->igNext;
                     for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
                     {
-                        totalInstrCount += igInLoop->igInsCnt;
-                        if (igInLoop->igLoopBackEdge == loopHeaderIg)
+                        totalCodeSize += igInLoop->igSize;
+                        if ((igInLoop->igLoopBackEdge == loopHeaderIg) ||
+                            (totalCodeSize > emitComp->compJitAlignLoopMaxCodeSize))
                         {
-                            foundBackEdge = true;
                             break;
                         }
                     }
 
-                    assert(foundBackEdge);
-
                     // Only align if it matches the heuristics
-                    if (totalInstrCount <= emitComp->compJitAlignLoopMaxInstrCount)
+                    if (totalCodeSize <= emitComp->compJitAlignLoopMaxCodeSize)
                     {
+                        //printf("Aligning loop in %s.\n", emitComp->info.compMethodName);
                         dst = emitOutputNOP(dst, (-(int)(size_t)dst) & 0x0f);
                         assert(((size_t)dst & 0x0f) == 0);
                     }
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 8cf36d9eb7c70a..e9a3f0500b1633 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -42,8 +42,8 @@ CONFIG_INTEGER(JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0) // In deb
 CONFIG_INTEGER(JitDefaultFill, W("JitDefaultFill"), 0xdd) // In debug builds, initialize the memory allocated by the nra
                                                           // with this byte.
 CONFIG_INTEGER(JitAlignLoopMinBlockWeight, W("JitAlignLoopMinBlockWeight"), 0xffffffff) // Minimum weight needed of the first block of a loop to trigger its alignment.
-CONFIG_INTEGER(JitAlignLoopMaxInstrCount,
-               W("JitAlignLoopMaxInstrCount"),
+CONFIG_INTEGER(JitAlignLoopMaxCodeSize,
+               W("JitAlignLoopMaxCodeSize"),
                0)  // Maximum code size (in bytes) of the loop after which the the code alignment for that loop will be disabled.
 
 CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0)

From 0b227a82e0bd9174aad7065d802ca02ca78c5ded Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 16 Oct 2020 16:36:25 -0700
Subject: [PATCH 10/59] JitAlignLoopWith32BPadding

---
 src/coreclr/jit/codegenlinear.cpp |  4 ++++
 src/coreclr/jit/compiler.cpp      |  1 +
 src/coreclr/jit/compiler.h        |  1 +
 src/coreclr/jit/emit.cpp          |  4 ++--
 src/coreclr/jit/emitxarch.cpp     | 35 ++++++++++++++++++++++++++++---
 src/coreclr/jit/jitconfigvalues.h |  3 +++
 6 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index fd081cc0fd1283..2398c03a9923a3 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -774,6 +774,10 @@ void CodeGen::genCodeForBBlist()
                 printf("Adding 'align' instruction to align loop header block " FMT_BB, block->bbNext->bbNum);
             }
             GetEmitter()->emitLoopAlign();
+            if (compiler->compJitAlignLoopWith32BPadding)
+            {
+                GetEmitter()->emitLoopAlign();
+            }
 
             // Mark this IG as need alignment so during emitter we can check the instruction count heuristics of
             // all IGs that follows this IG and participate in a loop.
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 4ff6b3e0b720c0..1f286b4003c2f9 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2617,6 +2617,7 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
 
     compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
     compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
+    compJitAlignLoopWith32BPadding = JitConfig.JitAlignLoopWith32BPadding() == 1;
 
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 582790cdb11953..d5be185903efe7 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -2244,6 +2244,7 @@ class Compiler
 
     unsigned compJitAlignLoopMinBlockWeight;
     unsigned compJitAlignLoopMaxCodeSize;
+    bool     compJitAlignLoopWith32BPadding;
 
 #ifdef DEBUG
     bool verbose;
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 6786a07d11d6d4..a446170caede0a 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3649,7 +3649,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
 
     UNATIVE_OFFSET csz = (UNATIVE_OFFSET)(*dp - curInsAdr);
 
-    if (csz != id->idCodeSize())
+    if ((id->idIns() != INS_align) && (csz != id->idCodeSize()))
     {
         /* It is fatal to under-estimate the instruction size */
         noway_assert(id->idCodeSize() >= csz);
@@ -3678,7 +3678,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
 
 #ifdef DEBUG
     /* Make sure the instruction descriptor size also matches our expectations */
-    if (is != emitSizeOfInsDsc(id))
+    if ((id->idIns() != INS_align) && (is != emitSizeOfInsDsc(id)))
     {
         printf("%s at %u: Expected size = %u , actual size = %u\n", emitIfName(id->idInsFmt()),
                id->idDebugOnlyInfo()->idNum, is, emitSizeOfInsDsc(id));
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index d502687266d656..417a959f0e1e53 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12600,6 +12600,17 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             {
                 sz = SMALL_IDSC_SIZE;
 
+                instrDesc* nextId = id;
+                castto(nextId, BYTE*) += sz;
+
+                if (emitComp->compJitAlignLoopWith32BPadding && (nextId->idIns() != INS_align))
+                {
+                    // If 32B was alignment was needed and next instruction is not alignment,
+                    // we already handled this while emitting the previous alignment instruction.
+                    // nothing to do now.
+                    break;
+                }
+
                 // Candidate for loop alignment
                 if (ig->igFlags & IGF_ALIGN_LOOP)
                 {
@@ -12619,8 +12630,26 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     if (totalCodeSize <= emitComp->compJitAlignLoopMaxCodeSize)
                     {
                         //printf("Aligning loop in %s.\n", emitComp->info.compMethodName);
-                        dst = emitOutputNOP(dst, (-(int)(size_t)dst) & 0x0f);
-                        assert(((size_t)dst & 0x0f) == 0);
+                        size_t nBytes = (-(int)(size_t)dst) & 0x0f;
+                        dst = emitOutputNOP(dst, nBytes);
+                        
+                        if (nextId->idIns() == INS_align)
+                        {
+                            // If next instruction is also alignment, this better be 32B padding.
+                            assert(emitComp->compJitAlignLoopWith32BPadding);
+
+                            // Align further to 32B boundary, if it is not yet.
+                            if (((size_t)dst & 0x1f) != 0)
+                            {
+                                dst = emitOutputNOP(dst, 1);
+                                dst = emitOutputNOP(dst, 15);
+                            }
+                            assert(((size_t)dst & 0x1f) == 0);
+                        }
+                        else
+                        {
+                            assert(((size_t)dst & 0x0f) == 0);
+                        }
                     }
                 }
 
@@ -13665,7 +13694,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
     }
 
     // Make sure we set the instruction descriptor size correctly
-    assert(sz == emitSizeOfInsDsc(id));
+    assert((sz == emitSizeOfInsDsc(id)) || (ins == INS_align));
 
 #if !FEATURE_FIXED_OUT_ARGS
     bool updateStackLevel = !emitIGisInProlog(ig) && !emitIGisInEpilog(ig);
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index e9a3f0500b1633..52b9d01d5d5c7c 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -45,6 +45,9 @@ CONFIG_INTEGER(JitAlignLoopMinBlockWeight, W("JitAlignLoopMinBlockWeight"), 0xff
 CONFIG_INTEGER(JitAlignLoopMaxCodeSize,
                W("JitAlignLoopMaxCodeSize"),
                0)  // Maximum code size (in bytes) of the loop after which the the code alignment for that loop will be disabled.
+CONFIG_INTEGER(JitAlignLoopWith32BPadding,
+               W("JitAlignLoopWith32BPadding"),
+               1) // Perform maximum 32B padding for loop alignment. If set false, pads with 16B.
 
 CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0)
 CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1)

From a9c976403785732dd9ff8a4c5f375a4543bf9462 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 22 Oct 2020 17:57:58 -0700
Subject: [PATCH 11/59] Add emitLoopAlign32Bytes()

---
 src/coreclr/jit/codegenlinear.cpp |   5 +-
 src/coreclr/jit/emit.cpp          |  18 ++++-
 src/coreclr/jit/emitxarch.cpp     | 124 +++++++++++++++++++++---------
 src/coreclr/jit/emitxarch.h       |   2 +
 4 files changed, 112 insertions(+), 37 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 2398c03a9923a3..13c4edb169e14f 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -773,8 +773,11 @@ void CodeGen::genCodeForBBlist()
             {
                 printf("Adding 'align' instruction to align loop header block " FMT_BB, block->bbNext->bbNum);
             }
-            GetEmitter()->emitLoopAlign();
             if (compiler->compJitAlignLoopWith32BPadding)
+            {
+                GetEmitter()->emitLoopAlign32Bytes();
+            }
+            else
             {
                 GetEmitter()->emitLoopAlign();
             }
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index a446170caede0a..0e172f4e74f207 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4795,7 +4795,9 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
     // For x64/x86, align methods that are "optimizations enabled" to 32 byte boundaries if
     // they are larger than 16 bytes and contain a loop.
     //
-    if (emitComp->opts.OptimizationEnabled() && !emitComp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT) &&
+    if (
+        //emitComp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER1) &&
+        emitComp->opts.OptimizationEnabled() && !emitComp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT) &&
         (emitTotalHotCodeSize > 16) && emitComp->fgHasLoops)
     {
         allocMemFlag = CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN;
@@ -5208,7 +5210,21 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
 
         for (unsigned cnt = ig->igInsCnt; cnt; cnt--)
         {
+#ifdef DEBUG
+            int oldCp = ((size_t)cp & 0xf0) >> 4;
+#endif
             castto(id, BYTE*) += emitIssue1Instr(ig, id, &cp);
+#ifdef DEBUG
+            
+            if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr)
+            {
+                int newCp = ((size_t)cp & 0xf0) >> 4;
+                if ((oldCp != newCp) && ((newCp % 2) == 0))
+                {
+                    printf("; =========================== 32B boundary ===========================\n");
+                }
+            }
+#endif
         }
 
 #ifdef DEBUG
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 417a959f0e1e53..4d3bc5167a3ec6 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2669,6 +2669,27 @@ void emitter::emitLoopAlign()
     emitCurIGsize += 15;
 }
 
+/*****************************************************************************
+ *
+ *  The next instruction will be a loop head entry point. Reserve room for
+ *  32-byte alignment: two emitLoopAlign() pseudo-instructions plus one
+ *  extra 1-byte INS_align below (presumably 15 + 15 + 1 = 31 bytes).
+ */
+
+void emitter::emitLoopAlign32Bytes()
+{
+    emitLoopAlign();
+    emitLoopAlign();
+
+    /* Insert a final 1-byte align pseudo-instruction so the three together
+       can cover every misalignment within a 32-byte block */
+
+    instrDesc* id = emitNewInstrSmall(EA_1BYTE);
+    id->idIns(INS_align);
+    id->idCodeSize(1); // Reserve just 1 more byte on top of the two calls above
+    emitCurIGsize += 1;
+}
+
 /*****************************************************************************
  *
  *  Add a NOP instruction of the given size.
@@ -12598,58 +12619,91 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             // the loop alignment pseudo instruction
             if (ins == INS_align)
             {
+                // Candidate for loop alignment
+                assert(ig->igFlags & IGF_ALIGN_LOOP);
                 sz = SMALL_IDSC_SIZE;
 
+                if (emitComp->compJitAlignLoopWith32BPadding)
+                {
+                    // If 32B alignment is needed and we are already on 32B boundary
+                    // no need to emit anything.
+                    if (((size_t)dst & 0x1f) == 0)
+                    {
+                        break;
+                    }
+                }
+                else
+                {
+                    // If 16B alignment is needed and we are already on 16B boundary
+                    // no need to emit anything.
+                    if ((((size_t)dst & 0x0f) == 0))
+                    {
+                        break;
+                    }
+                }
+
                 instrDesc* nextId = id;
                 castto(nextId, BYTE*) += sz;
 
-                if (emitComp->compJitAlignLoopWith32BPadding && (nextId->idIns() != INS_align))
+                // If we already know that the code size heuristics won't match,
+                // do not bother checking it again. Same applies for next instruction
+                // if that too is INS_align.
+                if ((id->idCodeSize() == 0))
                 {
-                    // If 32B was alignment was needed and next instruction is not alignment,
-                    // we already handled this while emitting the previous alignment instruction.
-                    // nothing to do now.
+                    if (nextId->idIns() == INS_align)
+                    {
+                        id->idCodeSize(0);
+                    }
                     break;
                 }
 
-                // Candidate for loop alignment
-                if (ig->igFlags & IGF_ALIGN_LOOP)
+                unsigned  totalCodeSize = 0;
+                insGroup* loopHeaderIg  = ig->igNext;
+                for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
                 {
-                    unsigned  totalCodeSize = 0;
-                    insGroup* loopHeaderIg    = ig->igNext;
-                    for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
+                    totalCodeSize += igInLoop->igSize;
+                    if ((igInLoop->igLoopBackEdge == loopHeaderIg) ||
+                        (totalCodeSize > emitComp->compJitAlignLoopMaxCodeSize))
                     {
-                        totalCodeSize += igInLoop->igSize;
-                        if ((igInLoop->igLoopBackEdge == loopHeaderIg) ||
-                            (totalCodeSize > emitComp->compJitAlignLoopMaxCodeSize))
-                        {
-                            break;
-                        }
+                        break;
                     }
+                }
 
-                    // Only align if it matches the heuristics
-                    if (totalCodeSize <= emitComp->compJitAlignLoopMaxCodeSize)
+                // Only align if it matches the heuristics
+                if (totalCodeSize <= emitComp->compJitAlignLoopMaxCodeSize)
+                {
+                    // printf("Aligning loop in %s.\n", emitComp->info.compMethodName);
+                    // TODO: OK if it is close enough to 32B boundary like what if dst is 0xXXXXX21?
+                    // TODO: If the dst + totalCodeSize is within 32B, then no alignment needed.
+                    // TODO: Should we do the alignment only in hotCode and not in coldCodeBlock?
+                    size_t nBytes = (-(int)(size_t)dst) & 0x0f;
+                    dst           = emitOutputNOP(dst, nBytes);
+
+                    if (nextId->idIns() == INS_align)
                     {
-                        //printf("Aligning loop in %s.\n", emitComp->info.compMethodName);
-                        size_t nBytes = (-(int)(size_t)dst) & 0x0f;
-                        dst = emitOutputNOP(dst, nBytes);
-                        
-                        if (nextId->idIns() == INS_align)
-                        {
-                            // If next instruction is also alignment, this better be 32B padding.
-                            assert(emitComp->compJitAlignLoopWith32BPadding);
+                        // If next instruction is also alignment, this better be 32B padding.
+                        assert(emitComp->compJitAlignLoopWith32BPadding);
 
-                            // Align further to 32B boundary, if it is not yet.
-                            if (((size_t)dst & 0x1f) != 0)
-                            {
-                                dst = emitOutputNOP(dst, 1);
-                                dst = emitOutputNOP(dst, 15);
-                            }
-                            assert(((size_t)dst & 0x1f) == 0);
-                        }
-                        else
+                        // Align further to 32B boundary, if it is not yet.
+                        if (((size_t)dst & 0x1f) != 0)
                         {
-                            assert(((size_t)dst & 0x0f) == 0);
+                            dst = emitOutputNOP(dst, 15);
+                            dst = emitOutputNOP(dst, 1);
                         }
+                        assert(((size_t)dst & 0x1f) == 0);
+                    }
+                    else
+                    {
+                        assert(((size_t)dst & 0x0f) == 0);
+                    }
+                }
+                else
+                {
+                    // If next instruction is align, skip it so
+                    // we do not check the heuristics again.
+                    if (nextId->idIns() == INS_align)
+                    {
+                        nextId->idCodeSize(0);
                     }
                 }
 
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index fb2aac2d30f0d9..07ed676dd1f979 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -289,6 +289,8 @@ inline emitAttr emitDecodeScale(unsigned ensz)
 public:
 void emitLoopAlign();
 
+void emitLoopAlign32Bytes();
+
 void emitIns(instruction ins);
 
 void emitIns(instruction ins, emitAttr attr);

From 7f1f787aa6f836f87e26cc06997d8d1ad954b4e7 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 22 Oct 2020 22:58:20 -0700
Subject: [PATCH 12/59] wip

---
 src/coreclr/jit/emit.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 2088ddad1618e3..f1fab43c078f98 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -250,6 +250,7 @@ struct insGroup
     unsigned int   igFuncIdx; // Which function/funclet does this belong to? (Index into Compiler::compFuncInfos array.)
     unsigned short igFlags;   // see IGF_xxx below
     unsigned short igSize;    // # of bytes of code in this group
+    insGroup*      igLoopBackEdge;
 
 #define IGF_GC_VARS 0x0001    // new set of live GC ref variables
 #define IGF_BYREF_REGS 0x0002 // new set of live by-ref registers
@@ -293,7 +294,6 @@ struct insGroup
 #endif
     regMaskSmall  igGCregs; // set of registers with live GC refs
     unsigned char igInsCnt; // # of instructions  in this group
-    insGroup*     igLoopBackEdge;
 
 #else // REGMASK_BITS
 

From aac18a4005d1477fb1c43b18f02715e74328bda7 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Tue, 27 Oct 2020 23:01:42 -0700
Subject: [PATCH 13/59] Add logic to avoid emitting nop if not needed

---
 src/coreclr/jit/codegenlinear.cpp |  3 ++
 src/coreclr/jit/emit.cpp          | 12 +++++-
 src/coreclr/jit/emitxarch.cpp     | 61 ++++++++++++++++++++++---------
 3 files changed, 57 insertions(+), 19 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 13c4edb169e14f..89a92538fa4626 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -775,7 +775,10 @@ void CodeGen::genCodeForBBlist()
             }
             if (compiler->compJitAlignLoopWith32BPadding)
             {
+#if defined(TARGET_XARCH)
+                //TODO: Only do this if we are confident that the loop size doesn't exceed the heuristics threshold
                 GetEmitter()->emitLoopAlign32Bytes();
+#endif
             }
             else
             {
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 0e172f4e74f207..d35923925ce282 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3649,7 +3649,11 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
 
     UNATIVE_OFFSET csz = (UNATIVE_OFFSET)(*dp - curInsAdr);
 
-    if ((id->idIns() != INS_align) && (csz != id->idCodeSize()))
+    if (
+#if defined(TARGET_XARCH)
+        (id->idIns() != INS_align) &&
+#endif
+        (csz != id->idCodeSize()))
     {
         /* It is fatal to under-estimate the instruction size */
         noway_assert(id->idCodeSize() >= csz);
@@ -3678,7 +3682,11 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
 
 #ifdef DEBUG
     /* Make sure the instruction descriptor size also matches our expectations */
-    if ((id->idIns() != INS_align) && (is != emitSizeOfInsDsc(id)))
+    if (
+#if defined(TARGET_XARCH)
+        (id->idIns() != INS_align) &&
+#endif
+        (is != emitSizeOfInsDsc(id)))
     {
         printf("%s at %u: Expected size = %u , actual size = %u\n", emitIfName(id->idInsFmt()),
                id->idDebugOnlyInfo()->idNum, is, emitSizeOfInsDsc(id));
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 4d3bc5167a3ec6..3d3043362389a3 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12652,7 +12652,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 {
                     if (nextId->idIns() == INS_align)
                     {
-                        id->idCodeSize(0);
+                        nextId->idCodeSize(0);
                     }
                     break;
                 }
@@ -12669,36 +12669,63 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     }
                 }
 
+                bool ignoreNextAlignIns = false;
+                unsigned alignmentBoundary  = emitComp->compJitAlignLoopWith32BPadding ? 32 : 16;
                 // Only align if it matches the heuristics
                 if (totalCodeSize <= emitComp->compJitAlignLoopMaxCodeSize)
                 {
-                    // printf("Aligning loop in %s.\n", emitComp->info.compMethodName);
-                    // TODO: OK if it is close enough to 32B boundary like what if dst is 0xXXXXX21?
-                    // TODO: If the dst + totalCodeSize is within 32B, then no alignment needed.
-                    // TODO: Should we do the alignment only in hotCode and not in coldCodeBlock?
-                    size_t nBytes = (-(int)(size_t)dst) & 0x0f;
-                    dst           = emitOutputNOP(dst, nBytes);
-
-                    if (nextId->idIns() == INS_align)
+                    int    minimumBlocksNeeded = (totalCodeSize + alignmentBoundary - 1) / alignmentBoundary;
+                    int    extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - totalCodeSize;
+                    size_t currentOffset       = (size_t)dst % alignmentBoundary;
+
+                    // If current offset is less than extra bytes that are not in loop,
+                    // then no need of alignment as the entire loop body will fit in
+                    // minimumBlocksNeeded.
+                    if (currentOffset < extraBytesNotInLoop)
                     {
-                        // If next instruction is also alignment, this better be 32B padding.
-                        assert(emitComp->compJitAlignLoopWith32BPadding);
+                        if (emitComp->opts.disAsm)
+                        {
+                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < extraBytesNotInLoop) ~~~~~~~~~~~~~~~~~~~~~~\n");
+                        }
 
-                        // Align further to 32B boundary, if it is not yet.
-                        if (((size_t)dst & 0x1f) != 0)
+                        if (nextId->idIns() == INS_align)
                         {
-                            dst = emitOutputNOP(dst, 15);
-                            dst = emitOutputNOP(dst, 1);
+                            nextId->idCodeSize(0);
                         }
-                        assert(((size_t)dst & 0x1f) == 0);
                     }
                     else
                     {
-                        assert(((size_t)dst & 0x0f) == 0);
+                        // printf("Aligning loop in %s.\n", emitComp->info.compMethodName);
+                        // TODO: OK if it is close enough to 32B boundary like what if dst is 0xXXXXX21?
+                        // TODO: If the dst + totalCodeSize is within 32B, then no alignment needed.
+                        // TODO: Should we do the alignment only in hotCode and not in coldCodeBlock?
+                        size_t nBytes = (-(int)(size_t)dst) & 0x0f;
+                        dst           = emitOutputNOP(dst, nBytes);
+
+                        if (nextId->idIns() == INS_align)
+                        {
+                            // If next instruction is also alignment, this better be 32B padding.
+                            assert(emitComp->compJitAlignLoopWith32BPadding);
+
+                            // Align further to 32B boundary, if it is not yet.
+                            if (((size_t)dst & 0x1f) != 0)
+                            {
+                                dst = emitOutputNOP(dst, 15);
+                                dst = emitOutputNOP(dst, 1);
+                            }
+                        }
+
+                        // In the end dst should be at alignment boundary
+                        assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
                     }
                 }
                 else
                 {
+                    if (emitComp->opts.disAsm)
+                    {
+                        printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (totalCodeSize <= "
+                               "emitComp->compJitAlignLoopMaxCodeSize) ~~~~~~~~~~~~~~~~~~~~~~\n");
+                    }
                     // If next instruction is align, skip it so
                     // we do not check the heuristics again.
                     if (nextId->idIns() == INS_align)

From 5c3c40e37badaa60bde7c9989a8b9e4e098587a6 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 28 Oct 2020 14:57:36 -0700
Subject: [PATCH 14/59] fix a condition

---
 src/coreclr/jit/emitxarch.cpp | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 3d3043362389a3..880cbb53c3c54d 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12652,6 +12652,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 {
                     if (nextId->idIns() == INS_align)
                     {
+                        assert(emitComp->compJitAlignLoopWith32BPadding);
                         nextId->idCodeSize(0);
                     }
                     break;
@@ -12681,15 +12682,16 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     // If current offset is less than extra bytes that are not in loop,
                     // then no need of alignment as the entire loop body will fit in
                     // minimumBlocksNeeded.
-                    if (currentOffset < extraBytesNotInLoop)
+                    if (currentOffset <= extraBytesNotInLoop)
                     {
                         if (emitComp->opts.disAsm)
                         {
-                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < extraBytesNotInLoop) ~~~~~~~~~~~~~~~~~~~~~~\n");
+                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset <= extraBytesNotInLoop) in (%s)\n", emitComp->info.compMethodName);
                         }
 
                         if (nextId->idIns() == INS_align)
                         {
+                            assert(emitComp->compJitAlignLoopWith32BPadding);
                             nextId->idCodeSize(0);
                         }
                     }
@@ -12712,9 +12714,17 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                             {
                                 dst = emitOutputNOP(dst, 15);
                                 dst = emitOutputNOP(dst, 1);
+#if DEBUG
+                                nBytes += 16;
+#endif
                             }
                         }
 
+                        if (emitComp->opts.disAsm)
+                        {
+                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ alignment= %d bytes, loopsize= %d bytes, minBlocksNeeded= %d, extraBytesNotInLoop= %d in (%s)\n", nBytes, totalCodeSize, minimumBlocksNeeded, extraBytesNotInLoop, emitComp->info.compMethodName);
+                        }
+
                         // In the end dst should be at alignment boundary
                         assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
                     }
@@ -12724,12 +12734,14 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     if (emitComp->opts.disAsm)
                     {
                         printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (totalCodeSize <= "
-                               "emitComp->compJitAlignLoopMaxCodeSize) ~~~~~~~~~~~~~~~~~~~~~~\n");
+                               "emitComp->compJitAlignLoopMaxCodeSize) in (%s)\n",
+                               emitComp->info.compMethodName);
                     }
                     // If next instruction is align, skip it so
                     // we do not check the heuristics again.
                     if (nextId->idIns() == INS_align)
                     {
+                        assert(emitComp->compJitAlignLoopWith32BPadding);
                         nextId->idCodeSize(0);
                     }
                 }

From fb91d419008ae45368e6ad53239375882daeb603 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 30 Oct 2020 00:39:54 -0700
Subject: [PATCH 15/59] Several things:

- Replaced JitAlignLoopWith32BPadding with JitAlignLoopBoundary
- Added JitAlignLoopForJcc
- Added logging of boundary and point where instruction splitting happens
- Add logic to take into consideration JCC.
---
 src/coreclr/jit/codegenlinear.cpp |  4 +-
 src/coreclr/jit/compiler.cpp      |  3 +-
 src/coreclr/jit/compiler.h        |  3 +-
 src/coreclr/jit/emit.cpp          | 37 +++++++++++--
 src/coreclr/jit/emitxarch.cpp     | 91 ++++++++++++++++---------------
 src/coreclr/jit/emitxarch.h       |  2 +-
 src/coreclr/jit/jitconfigvalues.h |  9 ++-
 7 files changed, 93 insertions(+), 56 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 89a92538fa4626..c5298efc582a37 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -773,11 +773,11 @@ void CodeGen::genCodeForBBlist()
             {
                 printf("Adding 'align' instruction to align loop header block " FMT_BB, block->bbNext->bbNum);
             }
-            if (compiler->compJitAlignLoopWith32BPadding)
+            if (compiler->compJitAlignLoopBoundary > 16)
             {
 #if defined(TARGET_XARCH)
                 //TODO: Only do this if we are confident that the loop size doesn't exceed the heuristics threshold
-                GetEmitter()->emitLoopAlign32Bytes();
+                GetEmitter()->emitVariableLoopAlign();
 #endif
             }
             else
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 1f286b4003c2f9..3a3c1701017808 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2617,7 +2617,8 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
 
     compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
     compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
-    compJitAlignLoopWith32BPadding = JitConfig.JitAlignLoopWith32BPadding() == 1;
+    compJitAlignLoopBoundary       = ReinterpretHexAsDecimal(JitConfig.JitAlignLoopBoundary());
+    compJitAlignLoopForJcc         = JitConfig.JitAlignLoopForJcc() == 1;
 
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index d5be185903efe7..4021da725b4104 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -2244,7 +2244,8 @@ class Compiler
 
     unsigned compJitAlignLoopMinBlockWeight;
     unsigned compJitAlignLoopMaxCodeSize;
-    bool     compJitAlignLoopWith32BPadding;
+    unsigned compJitAlignLoopBoundary;
+    bool     compJitAlignLoopForJcc;
 
 #ifdef DEBUG
     bool verbose;
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index d35923925ce282..3682a03e51d64e 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -5219,17 +5219,46 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
         for (unsigned cnt = ig->igInsCnt; cnt; cnt--)
         {
 #ifdef DEBUG
-            int oldCp = ((size_t)cp & 0xf0) >> 4;
+            size_t lastCp = (size_t) cp;
+            instrDesc* lastId = id;
 #endif
             castto(id, BYTE*) += emitIssue1Instr(ig, id, &cp);
 #ifdef DEBUG
             
             if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr)
             {
-                int newCp = ((size_t)cp & 0xf0) >> 4;
-                if ((oldCp != newCp) && ((newCp % 2) == 0))
+                size_t lastBoundaryAddr = (size_t)cp & ~((size_t)emitComp->compJitAlignLoopBoundary - 1);
+
+                // draw boundary if lastCp was before the lastBoundary.
+                if (lastCp < lastBoundaryAddr)
                 {
-                    printf("; =========================== 32B boundary ===========================\n");
+                    printf("; ");
+                    instruction lastIns = lastId->idIns();
+
+                    // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
+                    bool isJccAffectedIns = ((lastIns >= INS_i_jmp && lastIns < INS_align) || (lastIns == INS_call) ||
+                                            (lastIns == INS_ret));
+                    if (cnt)
+                    {
+                        instruction currIns = id->idIns();
+                        if ((lastIns == INS_cmp) || (lastIns == INS_test) || (lastIns == INS_add) || (lastIns == INS_sub) ||
+                            (lastIns == INS_and) || (lastIns == INS_inc) || (lastIns == INS_dec))
+                        {
+                            isJccAffectedIns |= (currIns >= INS_i_jmp && currIns < INS_align);
+                        }
+                    }
+
+                    // Indicate if instruction is at or split at 32B boundary
+                    unsigned bytesCrossedBoundary = ((size_t)cp & 0x1f);
+                    if ((bytesCrossedBoundary != 0) || (isJccAffectedIns && bytesCrossedBoundary == 0))
+                    {
+                        printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(lastId->idIns()), bytesCrossedBoundary);
+                    }
+                    else
+                    {
+                        printf("...............................");
+                    }
+                    printf(" %dB boundary ...............................\n", (emitComp->compJitAlignLoopBoundary));
                 }
             }
 #endif
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 880cbb53c3c54d..d0a1cf2a101c29 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2676,18 +2676,27 @@ void emitter::emitLoopAlign()
  *  the x86 I-cache alignment rule is followed.
  */
 
-void emitter::emitLoopAlign32Bytes()
+void emitter::emitVariableLoopAlign()
 {
-    emitLoopAlign();
-    emitLoopAlign();
+    unsigned insAlignCount = (emitComp->compJitAlignLoopBoundary - 1) / 15;
+    unsigned lastInsAlignSize = (emitComp->compJitAlignLoopBoundary - 1) % 15;
+
+    while (insAlignCount)
+    {
+        emitLoopAlign();
+        insAlignCount--;
+    }
 
     /* Insert a pseudo-instruction to ensure that we align
        the next instruction properly */
 
-    instrDesc* id = emitNewInstrSmall(EA_1BYTE);
-    id->idIns(INS_align);
-    id->idCodeSize(1); // We may need to skip up to 15 bytes of code
-    emitCurIGsize += 1;
+    if (lastInsAlignSize > 0)
+    {
+        instrDesc* id = emitNewInstrSmall(EA_1BYTE);
+        id->idIns(INS_align);
+        id->idCodeSize(lastInsAlignSize);
+        emitCurIGsize += lastInsAlignSize;
+    }
 }
 
 /*****************************************************************************
@@ -12619,27 +12628,16 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             // the loop alignment pseudo instruction
             if (ins == INS_align)
             {
+                unsigned alignmentBoundary = emitComp->compJitAlignLoopBoundary;
+
                 // Candidate for loop alignment
                 assert(ig->igFlags & IGF_ALIGN_LOOP);
                 sz = SMALL_IDSC_SIZE;
 
-                if (emitComp->compJitAlignLoopWith32BPadding)
+                // If already at alignment boundary, no need to emit anything.
+                if (((size_t)dst & (alignmentBoundary - 1)) == 0)
                 {
-                    // If 32B alignment is needed and we are already on 32B boundary
-                    // no need to emit anything.
-                    if (((size_t)dst & 0x1f) == 0)
-                    {
-                        break;
-                    }
-                }
-                else
-                {
-                    // If 16B alignment is needed and we are already on 16B boundary
-                    // no need to emit anything.
-                    if ((((size_t)dst & 0x0f) == 0))
-                    {
-                        break;
-                    }
+                    break;
                 }
 
                 instrDesc* nextId = id;
@@ -12652,62 +12650,67 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 {
                     if (nextId->idIns() == INS_align)
                     {
-                        assert(emitComp->compJitAlignLoopWith32BPadding);
+                        assert(emitComp->compJitAlignLoopBoundary > 16);
                         nextId->idCodeSize(0);
                     }
                     break;
                 }
 
-                unsigned  totalCodeSize = 0;
+                unsigned  loopSize = 0;
                 insGroup* loopHeaderIg  = ig->igNext;
                 for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
                 {
-                    totalCodeSize += igInLoop->igSize;
+                    loopSize += igInLoop->igSize;
                     if ((igInLoop->igLoopBackEdge == loopHeaderIg) ||
-                        (totalCodeSize > emitComp->compJitAlignLoopMaxCodeSize))
+                        (loopSize > emitComp->compJitAlignLoopMaxCodeSize))
                     {
                         break;
                     }
                 }
 
                 bool ignoreNextAlignIns = false;
-                unsigned alignmentBoundary  = emitComp->compJitAlignLoopWith32BPadding ? 32 : 16;
+
                 // Only align if it matches the heuristics
-                if (totalCodeSize <= emitComp->compJitAlignLoopMaxCodeSize)
+                if (loopSize <= emitComp->compJitAlignLoopMaxCodeSize)
                 {
-                    int    minimumBlocksNeeded = (totalCodeSize + alignmentBoundary - 1) / alignmentBoundary;
-                    int    extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - totalCodeSize;
+                    int    minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+                    int    extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
                     size_t currentOffset       = (size_t)dst % alignmentBoundary;
 
-                    // If current offset is less than extra bytes that are not in loop,
-                    // then no need of alignment as the entire loop body will fit in
-                    // minimumBlocksNeeded.
+                    // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
+                    if (emitComp->compJitAlignLoopForJcc)
+                    {
+                        // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
+                        currentOffset++;
+                    }
+
+                    //TODO: Add a switch for JCC
+                    //TODO: Revisit nop sequence we emit in case of 31 bytes
+
+                    // Padding is needed only if loop starts at or after the current offset.
+                    // Otherwise, the loop just fits in minimumBlocksNeeded and so no alignment needed.
                     if (currentOffset <= extraBytesNotInLoop)
                     {
                         if (emitComp->opts.disAsm)
                         {
-                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset <= extraBytesNotInLoop) in (%s)\n", emitComp->info.compMethodName);
+                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < extraBytesNotInLoop) in (%s)\n", emitComp->info.compMethodName);
                         }
 
                         if (nextId->idIns() == INS_align)
                         {
-                            assert(emitComp->compJitAlignLoopWith32BPadding);
+                            assert(emitComp->compJitAlignLoopBoundary > 16);
                             nextId->idCodeSize(0);
                         }
                     }
                     else
                     {
-                        // printf("Aligning loop in %s.\n", emitComp->info.compMethodName);
-                        // TODO: OK if it is close enough to 32B boundary like what if dst is 0xXXXXX21?
-                        // TODO: If the dst + totalCodeSize is within 32B, then no alignment needed.
-                        // TODO: Should we do the alignment only in hotCode and not in coldCodeBlock?
                         size_t nBytes = (-(int)(size_t)dst) & 0x0f;
                         dst           = emitOutputNOP(dst, nBytes);
 
                         if (nextId->idIns() == INS_align)
                         {
                             // If next instruction is also alignment, this better be 32B padding.
-                            assert(emitComp->compJitAlignLoopWith32BPadding);
+                            assert(emitComp->compJitAlignLoopBoundary > 16);
 
                             // Align further to 32B boundary, if it is not yet.
                             if (((size_t)dst & 0x1f) != 0)
@@ -12722,7 +12725,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 
                         if (emitComp->opts.disAsm)
                         {
-                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ alignment= %d bytes, loopsize= %d bytes, minBlocksNeeded= %d, extraBytesNotInLoop= %d in (%s)\n", nBytes, totalCodeSize, minimumBlocksNeeded, extraBytesNotInLoop, emitComp->info.compMethodName);
+                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ alignment= %d bytes, loopsize= %d bytes, minBlocksNeeded= %d, extraBytesNotInLoop= %d in (%s)\n", nBytes, loopSize, minimumBlocksNeeded, extraBytesNotInLoop, emitComp->info.compMethodName);
                         }
 
                         // In the end dst should be at alignment boundary
@@ -12733,7 +12736,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 {
                     if (emitComp->opts.disAsm)
                     {
-                        printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (totalCodeSize <= "
+                        printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (loopSize <= "
                                "emitComp->compJitAlignLoopMaxCodeSize) in (%s)\n",
                                emitComp->info.compMethodName);
                     }
@@ -12741,7 +12744,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     // we do not check the heuristics again.
                     if (nextId->idIns() == INS_align)
                     {
-                        assert(emitComp->compJitAlignLoopWith32BPadding);
+                        assert(emitComp->compJitAlignLoopBoundary > 16);
                         nextId->idCodeSize(0);
                     }
                 }
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 07ed676dd1f979..409cfa442a689d 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -289,7 +289,7 @@ inline emitAttr emitDecodeScale(unsigned ensz)
 public:
 void emitLoopAlign();
 
-void emitLoopAlign32Bytes();
+void emitVariableLoopAlign();
 
 void emitIns(instruction ins);
 
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 52b9d01d5d5c7c..72781030666d73 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -45,9 +45,12 @@ CONFIG_INTEGER(JitAlignLoopMinBlockWeight, W("JitAlignLoopMinBlockWeight"), 0xff
 CONFIG_INTEGER(JitAlignLoopMaxCodeSize,
                W("JitAlignLoopMaxCodeSize"),
                0)  // Maximum code size (in bytes) of the loop after which the the code alignment for that loop will be disabled.
-CONFIG_INTEGER(JitAlignLoopWith32BPadding,
-               W("JitAlignLoopWith32BPadding"),
-               1) // Perform maximum 32B padding for loop alignment. If set false, pads with 16B.
+CONFIG_INTEGER(JitAlignLoopBoundary,
+               W("JitAlignLoopBoundary"),
+               0x32) // Boundary (multiples of 2) at which inner loops should be aliged. By default, it is set to 32B.
+CONFIG_INTEGER(JitAlignLoopForJcc,
+               W("JitAlignLoopForJcc"),
+               0) // If set, while doing loop alignment, ensure loop jmps don't cross alignment boundary.
 
 CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0)
 CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1)

From 0b8eb7844f5ba204f1611e634f7ad28defa5ba56 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 5 Nov 2020 10:44:36 -0800
Subject: [PATCH 16/59] Added JitAlignLoopAdaptive algorithm

---
 src/coreclr/jit/codegenlinear.cpp |   2 +-
 src/coreclr/jit/compiler.cpp      |   4 +
 src/coreclr/jit/compiler.h        |   1 +
 src/coreclr/jit/emit.cpp          |   4 +
 src/coreclr/jit/emitxarch.cpp     | 260 +++++++++++++++++++++---------
 src/coreclr/jit/jitconfigvalues.h |   8 +-
 6 files changed, 200 insertions(+), 79 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index c5298efc582a37..eea946afd2910b 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -773,7 +773,7 @@ void CodeGen::genCodeForBBlist()
             {
                 printf("Adding 'align' instruction to align loop header block " FMT_BB, block->bbNext->bbNum);
             }
-            if (compiler->compJitAlignLoopBoundary > 16)
+            if ((compiler->compJitAlignLoopBoundary > 16) && (!compiler->compJitAlignLoopAdaptive))
             {
 #if defined(TARGET_XARCH)
                 //TODO: Only do this if we are confident that the loop size doesn't exceed the heuristics threshold
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 3a3c1701017808..0c04cfe5d02295 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2619,6 +2619,9 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
     compJitAlignLoopBoundary       = ReinterpretHexAsDecimal(JitConfig.JitAlignLoopBoundary());
     compJitAlignLoopForJcc         = JitConfig.JitAlignLoopForJcc() == 1;
+    compJitAlignLoopAdaptive       = JitConfig.JitAlignLoopAdaptive() == 1;
+    assert(isPow2(compJitAlignLoopBoundary));
+
 
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns
@@ -3931,6 +3934,7 @@ void Compiler::compSetOptimizationLevel()
         else
         {
             codeGen->SetAlignLoops(opts.jitFlags->IsSet(JitFlags::JIT_FLAG_ALIGN_LOOPS));
+            //codeGen->SetAlignLoops(true);
         }
     }
 
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 4021da725b4104..32fd8db70f87f8 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -2246,6 +2246,7 @@ class Compiler
     unsigned compJitAlignLoopMaxCodeSize;
     unsigned compJitAlignLoopBoundary;
     bool     compJitAlignLoopForJcc;
+    bool     compJitAlignLoopAdaptive;
 
 #ifdef DEBUG
     bool verbose;
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 3682a03e51d64e..9a3f2740d95051 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -5235,6 +5235,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
                     printf("; ");
                     instruction lastIns = lastId->idIns();
 
+#if defined(TARGET_XARCH)
                     // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
                     bool isJccAffectedIns = ((lastIns >= INS_i_jmp && lastIns < INS_align) || (lastIns == INS_call) ||
                                             (lastIns == INS_ret));
@@ -5247,6 +5248,9 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
                             isJccAffectedIns |= (currIns >= INS_i_jmp && currIns < INS_align);
                         }
                     }
+#else
+                    bool isJccAffectedIns = false;
+#endif
 
                     // Indicate if instruction is at or split at 32B boundary
                     unsigned bytesCrossedBoundary = ((size_t)cp & 0x1f);
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index d0a1cf2a101c29..185b25ecd89469 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12628,11 +12628,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             // the loop alignment pseudo instruction
             if (ins == INS_align)
             {
-                unsigned alignmentBoundary = emitComp->compJitAlignLoopBoundary;
-
                 // Candidate for loop alignment
+                assert(codeGen->ShouldAlignLoops());
                 assert(ig->igFlags & IGF_ALIGN_LOOP);
-                sz = SMALL_IDSC_SIZE;
+                unsigned alignmentBoundary = emitComp->compJitAlignLoopBoundary;
+                sz                         = SMALL_IDSC_SIZE;
 
                 // If already at alignment boundary, no need to emit anything.
                 if (((size_t)dst & (alignmentBoundary - 1)) == 0)
@@ -12640,112 +12640,220 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     break;
                 }
 
-                instrDesc* nextId = id;
-                castto(nextId, BYTE*) += sz;
-
-                // If we already know that the code size heuristics won't match,
-                // do not bother checking it again. Same applies for next instruction
-                // if that too is INS_align.
-                if ((id->idCodeSize() == 0))
+                if (emitComp->compJitAlignLoopAdaptive)
                 {
-                    if (nextId->idIns() == INS_align)
+                    // calculate the loop size
+                    unsigned  loopSize     = 0;
+                    insGroup* loopHeaderIg = ig->igNext;
+                    for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
                     {
-                        assert(emitComp->compJitAlignLoopBoundary > 16);
-                        nextId->idCodeSize(0);
+                        loopSize += igInLoop->igSize;
+                        if ((igInLoop->igLoopBackEdge == loopHeaderIg))
+                        {
+                            break;
+                        }
                     }
-                    break;
-                }
 
-                unsigned  loopSize = 0;
-                insGroup* loopHeaderIg  = ig->igNext;
-                for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
-                {
-                    loopSize += igInLoop->igSize;
-                    if ((igInLoop->igLoopBackEdge == loopHeaderIg) ||
-                        (loopSize > emitComp->compJitAlignLoopMaxCodeSize))
+                    // Start to align on 32B boundary with a fallback to 16B boundary
+                    alignmentBoundary                = 32;
+                    int      minBlocksNeededForLoop  = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+                    int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary);
+                    unsigned nMaxPaddingBytes        = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop)) - 1;
+                    unsigned nPaddingBytes           = (-(int)(size_t)dst) & (alignmentBoundary - 1);
+                    bool     skipPadding             = false;
+
+                    if (minBlocksNeededForLoop > maxBlocksAllowedForLoop)
                     {
-                        break;
+                        skipPadding = true;
+
+                        if (emitComp->opts.disAsm)
+                        {
+                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because maxBlocksAllowed. loopSize= %d, "
+                                   "minBlocksNeededForLoop= %d, alignmentBoundary= %d, nPaddingBytes= %d, "
+                                   "nMaxPaddingBytes= %d in (%s)\n",
+                                   loopSize, minBlocksNeededForLoop, alignmentBoundary, nPaddingBytes, nMaxPaddingBytes,
+                                   emitComp->info.compMethodName);
+                        }
                     }
-                }
+                    else if (nPaddingBytes > nMaxPaddingBytes)
+                    {
+                        // Now try to align to 16B boundary
+                        alignmentBoundary = 16;
+                        nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop);
+                        nPaddingBytes     = (-(int)(size_t)dst) & (alignmentBoundary - 1);
 
-                bool ignoreNextAlignIns = false;
+                        if (nPaddingBytes > nMaxPaddingBytes)
+                        {
+                            skipPadding = true;
 
-                // Only align if it matches the heuristics
-                if (loopSize <= emitComp->compJitAlignLoopMaxCodeSize)
-                {
-                    int    minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-                    int    extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
-                    size_t currentOffset       = (size_t)dst % alignmentBoundary;
+                            if (emitComp->opts.disAsm)
+                            {
+                                printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because maxPaddingAllowed. loopSize= %d, "
+                                       "minBlocksNeededForLoop= %d, alignmentBoundary= %d, nPaddingBytes= %d, "
+                                       "nMaxPaddingBytes= %d in (%s)\n",
+                                       loopSize, minBlocksNeededForLoop, alignmentBoundary, nPaddingBytes,
+                                       nMaxPaddingBytes, emitComp->info.compMethodName);
+                            }
+                        }
+                    }
 
-                    // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
-                    if (emitComp->compJitAlignLoopForJcc)
+                    if (!skipPadding && (nPaddingBytes > 0))
                     {
-                        // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
-                        currentOffset++;
-                    }
+                        int    extraBytesNotInLoop = (32 * minBlocksNeededForLoop) - loopSize; // Still have it at alignmentboundary=32
+                        size_t currentOffset       = (size_t)dst % alignmentBoundary;
 
-                    //TODO: Add a switch for JCC
-                    //TODO: Revisit nop sequence we emit in case of 31 bytes
+                        // Padding is needed only if loop starts at or after the current offset.
+                        // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
+                        if (currentOffset <= extraBytesNotInLoop)
+                        {
+                            skipPadding = true;
 
-                    // Padding is needed only if loop starts at or after the current offset.
-                    // Otherwise, the loop just fits in minimumBlocksNeeded and so no alignment needed.
-                    if (currentOffset <= extraBytesNotInLoop)
-                    {
-                        if (emitComp->opts.disAsm)
+                            if (emitComp->opts.disAsm)
+                            {
+                                printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < "
+                                       "extraBytesNotInLoop) in (%s)\n",
+                                       emitComp->info.compMethodName);
+                            }
+                        }
+                        else
                         {
-                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < extraBytesNotInLoop) in (%s)\n", emitComp->info.compMethodName);
+                            dst = emitOutputNOP(dst, nPaddingBytes);
+                            assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
+
+                            if (emitComp->opts.disAsm)
+                            {
+                                printf("; ~~~~~~~~~~~~~~~~~~~~~~ alignment= %d bytes, loopsize= %d bytes, "
+                                       "minBlocksNeeded= %d, extraBytesNotInLoop= %d, alignmentBoundary= %dB in (%s)\n",
+                                       nPaddingBytes, loopSize, minBlocksNeededForLoop, extraBytesNotInLoop, alignmentBoundary,
+                                       emitComp->info.compMethodName);
+                            }
                         }
+                    }
 
+                    // Update the code size of id
+                    if (skipPadding)
+                    {
+                        id->idCodeSize(0);
+                        ig->igFlags |= IGF_UPD_ISZ;
+                    }
+                    else if(nPaddingBytes != id->idCodeSize())
+                    {
+                        id->idCodeSize(nPaddingBytes);
+                        ig->igFlags |= IGF_UPD_ISZ;
+                    }
+                }
+                else
+                {
+                    instrDesc* nextId = id;
+                    castto(nextId, BYTE*) += sz;
+
+                    // If we already know that the code size heuristics won't match,
+                    // do not bother checking it again. Same applies for next instruction
+                    // if that too is INS_align.
+                    if ((id->idCodeSize() == 0))
+                    {
                         if (nextId->idIns() == INS_align)
                         {
                             assert(emitComp->compJitAlignLoopBoundary > 16);
                             nextId->idCodeSize(0);
                         }
+                        break;
+                    }
+
+                    unsigned  loopSize     = 0;
+                    insGroup* loopHeaderIg = ig->igNext;
+                    for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
+                    {
+                        loopSize += igInLoop->igSize;
+                        if ((igInLoop->igLoopBackEdge == loopHeaderIg) ||
+                            (loopSize > emitComp->compJitAlignLoopMaxCodeSize))
+                        {
+                            break;
+                        }
                     }
-                    else
+
+                    // Only align if it matches the heuristics
+                    if (loopSize <= emitComp->compJitAlignLoopMaxCodeSize)
                     {
-                        size_t nBytes = (-(int)(size_t)dst) & 0x0f;
-                        dst           = emitOutputNOP(dst, nBytes);
+                        int    minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+                        int    extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
+                        size_t currentOffset       = (size_t)dst % alignmentBoundary;
 
-                        if (nextId->idIns() == INS_align)
+                        // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
+                        if (emitComp->compJitAlignLoopForJcc)
                         {
-                            // If next instruction is also alignment, this better be 32B padding.
-                            assert(emitComp->compJitAlignLoopBoundary > 16);
+                            // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
+                            currentOffset++;
+                        }
+
+                        // TODO: Add some kind of maxLoopPadding to something smaller so we don't see lot of pads added?
+                        // TODO: Revisit nop sequence we emit in case of 31 bytes
 
-                            // Align further to 32B boundary, if it is not yet.
-                            if (((size_t)dst & 0x1f) != 0)
+                        // Padding is needed only if loop starts at or after the current offset.
+                        // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
+                        if (currentOffset <= extraBytesNotInLoop)
+                        {
+                            if (emitComp->opts.disAsm)
                             {
-                                dst = emitOutputNOP(dst, 15);
-                                dst = emitOutputNOP(dst, 1);
+                                printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < "
+                                       "extraBytesNotInLoop) in (%s)\n",
+                                       emitComp->info.compMethodName);
+                            }
+
+                            if (nextId->idIns() == INS_align)
+                            {
+                                assert(emitComp->compJitAlignLoopBoundary > 16);
+                                nextId->idCodeSize(0);
+                            }
+                        }
+                        else
+                        {
+                            size_t nBytes = (-(int)(size_t)dst) & 0x0f;
+                            dst           = emitOutputNOP(dst, nBytes);
+
+                            if (nextId->idIns() == INS_align)
+                            {
+                                // If next instruction is also alignment, this better be 32B padding.
+                                assert(emitComp->compJitAlignLoopBoundary > 16);
+
+                                // Align further to 32B boundary, if it is not yet.
+                                if (((size_t)dst & 0x1f) != 0)
+                                {
+                                    dst = emitOutputNOP(dst, 15);
+                                    dst = emitOutputNOP(dst, 1);
 #if DEBUG
-                                nBytes += 16;
+                                    nBytes += 16;
 #endif
+                                }
                             }
-                        }
 
-                        if (emitComp->opts.disAsm)
-                        {
-                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ alignment= %d bytes, loopsize= %d bytes, minBlocksNeeded= %d, extraBytesNotInLoop= %d in (%s)\n", nBytes, loopSize, minimumBlocksNeeded, extraBytesNotInLoop, emitComp->info.compMethodName);
-                        }
+                            if (emitComp->opts.disAsm)
+                            {
+                                printf("; ~~~~~~~~~~~~~~~~~~~~~~ alignment= %d bytes, loopsize= %d bytes, "
+                                       "minBlocksNeeded= %d, extraBytesNotInLoop= %d in (%s)\n",
+                                       nBytes, loopSize, minimumBlocksNeeded, extraBytesNotInLoop,
+                                       emitComp->info.compMethodName);
+                            }
 
-                        // In the end dst should be at alignment boundary
-                        assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
-                    }
-                }
-                else
-                {
-                    if (emitComp->opts.disAsm)
-                    {
-                        printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (loopSize <= "
-                               "emitComp->compJitAlignLoopMaxCodeSize) in (%s)\n",
-                               emitComp->info.compMethodName);
+                            // In the end dst should be at alignment boundary
+                            assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
+                        }
                     }
-                    // If next instruction is align, skip it so
-                    // we do not check the heuristics again.
-                    if (nextId->idIns() == INS_align)
+                    else
                     {
-                        assert(emitComp->compJitAlignLoopBoundary > 16);
-                        nextId->idCodeSize(0);
+                        if (emitComp->opts.disAsm)
+                        {
+                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because threshold. loopSize= %d, "
+                                   "emitComp->compJitAlignLoopMaxCodeSize= %d in (%s)\n",
+                                   loopSize, emitComp->compJitAlignLoopMaxCodeSize, emitComp->info.compMethodName);
+                        }
+                        // If next instruction is align, skip it so
+                        // we do not check the heuristics again.
+                        if (nextId->idIns() == INS_align)
+                        {
+                            assert(emitComp->compJitAlignLoopBoundary > 16);
+                            nextId->idCodeSize(0);
+                        }
                     }
                 }
 
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 72781030666d73..59a9e08d9ff0f8 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -41,10 +41,10 @@ CONFIG_INTEGER(JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0) // In deb
                                                                        // optimizations are performed on the fast path.
 CONFIG_INTEGER(JitDefaultFill, W("JitDefaultFill"), 0xdd) // In debug builds, initialize the memory allocated by the nra
                                                           // with this byte.
-CONFIG_INTEGER(JitAlignLoopMinBlockWeight, W("JitAlignLoopMinBlockWeight"), 0xffffffff) // Minimum weight needed of the first block of a loop to trigger its alignment.
+CONFIG_INTEGER(JitAlignLoopMinBlockWeight, W("JitAlignLoopMinBlockWeight"), 10) // Minimum weight needed of the first block of a loop to trigger its alignment.
 CONFIG_INTEGER(JitAlignLoopMaxCodeSize,
                W("JitAlignLoopMaxCodeSize"),
-               0)  // Maximum code size (in bytes) of the loop after which the the code alignment for that loop will be disabled.
+               0x60)  // Maximum code size (in bytes) of the loop after which the the code alignment for that loop will be disabled.
 CONFIG_INTEGER(JitAlignLoopBoundary,
                W("JitAlignLoopBoundary"),
                0x32) // Boundary (multiples of 2) at which inner loops should be aliged. By default, it is set to 32B.
@@ -52,6 +52,10 @@ CONFIG_INTEGER(JitAlignLoopForJcc,
                W("JitAlignLoopForJcc"),
                0) // If set, while doing loop alignment, ensure loop jmps don't cross alignment boundary.
 
+CONFIG_INTEGER(JitAlignLoopAdaptive,
+            W("JitAlignLoopAdaptive"),
+            1) // If set, perform loop alignment adaptive to limit number of padding added.
+
 CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0)
 CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1)
 CONFIG_INTEGER(JitDumpASCII, W("JitDumpASCII"), 1)         // Uses only ASCII characters in tree dumps

From 072a1132061c234b68ccb5b1dfd95bba3a5c7286 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 5 Nov 2020 10:59:12 -0800
Subject: [PATCH 17/59] wip

---
 src/coreclr/jit/codegenlinear.cpp |  5 +++--
 src/coreclr/jit/compiler.cpp      |  6 ++++--
 src/coreclr/jit/emitarm.cpp       | 15 ---------------
 src/coreclr/jit/emitarm.h         |  2 --
 src/coreclr/jit/emitarm64.h       |  2 --
 src/coreclr/jit/optimizer.cpp     |  2 ++
 6 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index eea946afd2910b..aa5f0a061e49c6 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -767,6 +767,7 @@ void CodeGen::genCodeForBBlist()
                 break;
         }
 
+#if defined(TARGET_XARCH)
         if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
         {
             if (verbose)
@@ -775,10 +776,9 @@ void CodeGen::genCodeForBBlist()
             }
             if ((compiler->compJitAlignLoopBoundary > 16) && (!compiler->compJitAlignLoopAdaptive))
             {
-#if defined(TARGET_XARCH)
                 //TODO: Only do this if we are confident that the loop size doesn't exceed the heuristics threshold
                 GetEmitter()->emitVariableLoopAlign();
-#endif
+
             }
             else
             {
@@ -789,6 +789,7 @@ void CodeGen::genCodeForBBlist()
             // all IGs that follows this IG and participate in a loop.
             GetEmitter()->emitCurIG->igFlags |= IGF_ALIGN_LOOP;
         }
+#endif
 
 #if defined(DEBUG) && defined(USING_VARIABLE_LIVE_RANGE)
         if (compiler->verbose)
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 0c04cfe5d02295..74e8754b4b4d43 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2619,7 +2619,8 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
     compJitAlignLoopBoundary       = ReinterpretHexAsDecimal(JitConfig.JitAlignLoopBoundary());
     compJitAlignLoopForJcc         = JitConfig.JitAlignLoopForJcc() == 1;
-    compJitAlignLoopAdaptive       = JitConfig.JitAlignLoopAdaptive() == 1;
+    // TODO: Default loop adaptive
+    compJitAlignLoopAdaptive       = true; //JitConfig.JitAlignLoopAdaptive() == 1;
     assert(isPow2(compJitAlignLoopBoundary));
 
 
@@ -3934,7 +3935,8 @@ void Compiler::compSetOptimizationLevel()
         else
         {
             codeGen->SetAlignLoops(opts.jitFlags->IsSet(JitFlags::JIT_FLAG_ALIGN_LOOPS));
-            //codeGen->SetAlignLoops(true);
+            // TODO: Default AlignLoop
+            codeGen->SetAlignLoops(true);
         }
     }
 
diff --git a/src/coreclr/jit/emitarm.cpp b/src/coreclr/jit/emitarm.cpp
index 48f754b3ab06f5..a2001cf0ffcd5a 100644
--- a/src/coreclr/jit/emitarm.cpp
+++ b/src/coreclr/jit/emitarm.cpp
@@ -1468,21 +1468,6 @@ void emitter::emitIns(instruction ins)
     appendToCurIG(id);
 }
 
-/*****************************************************************************
- *
- *  Add a NOP instructions to pad the instruction stream by (size / 4) bytes.
- */
-
-void emitter::emitLoopAlign()
-{
-    // Max out at 28 bytes of nop...
-    // 32 is the largest method entry alignment we support.
-    for (unsigned i = 0; i < 4; i++)
-    {
-        emitIns(INS_nop);
-    }
-}
-
 /*****************************************************************************
  *
  *  Add an instruction with a single immediate value.
diff --git a/src/coreclr/jit/emitarm.h b/src/coreclr/jit/emitarm.h
index b1e2512d2b09d1..e663a953e7a10b 100644
--- a/src/coreclr/jit/emitarm.h
+++ b/src/coreclr/jit/emitarm.h
@@ -215,8 +215,6 @@ static bool emitIns_valid_imm_for_vldst_offset(int imm);
 
 void emitIns(instruction ins);
 
-void emitLoopAlign();
-
 void emitIns_I(instruction ins, emitAttr attr, target_ssize_t imm);
 
 void emitIns_R(instruction ins, emitAttr attr, regNumber reg);
diff --git a/src/coreclr/jit/emitarm64.h b/src/coreclr/jit/emitarm64.h
index 403ee087082b9c..8b8c812aacfc41 100644
--- a/src/coreclr/jit/emitarm64.h
+++ b/src/coreclr/jit/emitarm64.h
@@ -723,8 +723,6 @@ inline static ssize_t computeRelPageAddr(size_t dstAddr, size_t srcAddr)
 public:
 void emitIns(instruction ins);
 
-void emitLoopAlign();
-
 void emitIns_I(instruction ins, emitAttr attr, ssize_t imm);
 
 void emitIns_R(instruction ins, emitAttr attr, regNumber reg);
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 599c6d2b01f5be..061127112260d1 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2543,6 +2543,7 @@ void Compiler::optFindNaturalLoops()
             assert(blk->bbNext != nullptr); // We should never reach nullptr.
         }
 
+#if defined(TARGET_XARCH)
         //TODO: Move should align loops flag to jitconfigvalues.h
         if (codeGen->ShouldAlignLoops())
         {
@@ -2554,6 +2555,7 @@ void Compiler::optFindNaturalLoops()
             }
         }
     }
+#endif
 
     // Make sure that loops are canonical: that every loop has a unique "top", by creating an empty "nop"
     // one, if necessary, for loops containing others that share a "top."

From 7e1328adb868b4871cf5556b5b908d86ccaa8d59 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 5 Nov 2020 11:00:20 -0800
Subject: [PATCH 18/59] revert emitarm64.cpp changes

---
 src/coreclr/jit/emitarm64.cpp | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/src/coreclr/jit/emitarm64.cpp b/src/coreclr/jit/emitarm64.cpp
index 7a2df2a4b0ac21..2f78d20a713967 100644
--- a/src/coreclr/jit/emitarm64.cpp
+++ b/src/coreclr/jit/emitarm64.cpp
@@ -3625,21 +3625,6 @@ void emitter::emitIns(instruction ins)
     appendToCurIG(id);
 }
 
-/*****************************************************************************
- *
- *  Add a NOP instructions to pad the instruction stream by (size / 4) bytes.
- */
-
-void emitter::emitLoopAlign()
-{
-    // Max out at 28 bytes of nop...
-    // 32 is the largest method entry alignment we support.
-    for (unsigned i = 0; i < 4; i++)
-    {
-        emitIns(INS_nop);
-    }
-}
-
 /*****************************************************************************
  *
  *  Add an instruction with a single immediate value.

From d49e84d88492505260994dcfc2ce11f7b72a5432 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 5 Nov 2020 11:44:20 -0800
Subject: [PATCH 19/59] fix errors during merge

---
 src/coreclr/jit/block.h  | 2 +-
 src/coreclr/jit/emit.cpp | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h
index 148dd24557c731..41b0ba8a0d94da 100644
--- a/src/coreclr/jit/block.h
+++ b/src/coreclr/jit/block.h
@@ -448,7 +448,7 @@ struct BasicBlock : private LIR::Range
 
 #define BBF_PATCHPOINT                     MAKE_BBFLAG(36) // Block is a patchpoint
 #define BBF_HAS_CLASS_PROFILE              MAKE_BBFLAG(37) // BB contains a call needing a class profile
-#define BBF_FIRST_BLOCK_IN_INNERLOOP      0x8000000000 // Block is lexically the fist block within the innermost loop.
+#define BBF_FIRST_BLOCK_IN_INNERLOOP       MAKE_BBFLAG(39) // Block is lexically the fist block within the innermost loop.
 
 // clang-format on
 
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 9a3f2740d95051..a21b1a9ef67980 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -5071,8 +5071,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
 #define DEFAULT_CODE_BUFFER_INIT 0xcc
 
 #ifdef DEBUG
-    *instrCount      = 0;
-    bool isColdBlock = false;
+    *instrCount = 0;
 #endif
     for (insGroup* ig = emitIGlist; ig != nullptr; ig = ig->igNext)
     {
@@ -5084,8 +5083,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
             assert(emitCurCodeOffs(cp) == emitTotalHotCodeSize);
 
             assert(coldCodeBlock);
-            cp          = coldCodeBlock;
-            isColdBlock = true;
+            cp = coldCodeBlock;
 #ifdef DEBUG
             if (emitComp->opts.disAsm || emitComp->verbose)
             {

From f6b5135a58059aac3ec0fa95bb81453716eb1b51 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 5 Nov 2020 12:13:30 -0800
Subject: [PATCH 20/59] fix build errors

---
 src/coreclr/jit/codegenlinear.cpp |  4 ++++
 src/coreclr/jit/compiler.cpp      |  8 ++++----
 src/coreclr/jit/emitxarch.cpp     | 21 ++++++++++++++-------
 src/coreclr/jit/optimizer.cpp     |  2 +-
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index aa5f0a061e49c6..9f1ef2aff34fa3 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -754,10 +754,12 @@ void CodeGen::genCodeForBBlist()
                     if (dstIG != nullptr && dstIG->igNum <= srcIG->igNum)
                     {
                         srcIG->igLoopBackEdge = dstIG;
+#ifdef DEBUG
                         if (verbose)
                         {
                             printf("** IG_%d jumps back to IG_%d forming a loop.\n", srcIG->igNum, dstIG->igNum);
                         }
+#endif
                     }
                 }
                 break;
@@ -770,10 +772,12 @@ void CodeGen::genCodeForBBlist()
 #if defined(TARGET_XARCH)
         if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
         {
+#ifdef DEBUG
             if (verbose)
             {
                 printf("Adding 'align' instruction to align loop header block " FMT_BB, block->bbNext->bbNum);
             }
+#endif
             if ((compiler->compJitAlignLoopBoundary > 16) && (!compiler->compJitAlignLoopAdaptive))
             {
                 //TODO: Only do this if we are confident that the loop size doesn't exceed the heuristics threshold
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 74e8754b4b4d43..1b66887c495aea 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2615,10 +2615,10 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.compDbgInfo = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_INFO);
     opts.compDbgEnC  = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_EnC);
 
-    compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
-    compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
-    compJitAlignLoopBoundary       = ReinterpretHexAsDecimal(JitConfig.JitAlignLoopBoundary());
-    compJitAlignLoopForJcc         = JitConfig.JitAlignLoopForJcc() == 1;
+    compJitAlignLoopMinBlockWeight = 10; //JitConfig.JitAlignLoopMinBlockWeight();
+    compJitAlignLoopMaxCodeSize    = 0x60; //JitConfig.JitAlignLoopMaxCodeSize();
+    compJitAlignLoopBoundary       = 32; //ReinterpretHexAsDecimal(JitConfig.JitAlignLoopBoundary());
+    compJitAlignLoopForJcc         = false; //JitConfig.JitAlignLoopForJcc() == 1;
     // TODO: Default loop adaptive
     compJitAlignLoopAdaptive       = true; //JitConfig.JitAlignLoopAdaptive() == 1;
     assert(isPow2(compJitAlignLoopBoundary));
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 185b25ecd89469..9d3ccd3d206296 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12665,7 +12665,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     if (minBlocksNeededForLoop > maxBlocksAllowedForLoop)
                     {
                         skipPadding = true;
-
+#if DEBUG
                         if (emitComp->opts.disAsm)
                         {
                             printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because maxBlocksAllowed. loopSize= %d, "
@@ -12674,6 +12674,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                                    loopSize, minBlocksNeededForLoop, alignmentBoundary, nPaddingBytes, nMaxPaddingBytes,
                                    emitComp->info.compMethodName);
                         }
+#endif
                     }
                     else if (nPaddingBytes > nMaxPaddingBytes)
                     {
@@ -12685,7 +12686,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                         if (nPaddingBytes > nMaxPaddingBytes)
                         {
                             skipPadding = true;
-
+#if DEBUG
                             if (emitComp->opts.disAsm)
                             {
                                 printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because maxPaddingAllowed. loopSize= %d, "
@@ -12694,6 +12695,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                                        loopSize, minBlocksNeededForLoop, alignmentBoundary, nPaddingBytes,
                                        nMaxPaddingBytes, emitComp->info.compMethodName);
                             }
+#endif
                         }
                     }
 
@@ -12707,19 +12709,20 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                         if (currentOffset <= extraBytesNotInLoop)
                         {
                             skipPadding = true;
-
+#if DEBUG
                             if (emitComp->opts.disAsm)
                             {
                                 printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < "
                                        "extraBytesNotInLoop) in (%s)\n",
                                        emitComp->info.compMethodName);
                             }
+#endif
                         }
                         else
                         {
                             dst = emitOutputNOP(dst, nPaddingBytes);
                             assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
-
+#if DEBUG
                             if (emitComp->opts.disAsm)
                             {
                                 printf("; ~~~~~~~~~~~~~~~~~~~~~~ alignment= %d bytes, loopsize= %d bytes, "
@@ -12727,6 +12730,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                                        nPaddingBytes, loopSize, minBlocksNeededForLoop, extraBytesNotInLoop, alignmentBoundary,
                                        emitComp->info.compMethodName);
                             }
+#endif
                         }
                     }
 
@@ -12793,13 +12797,14 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                         // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
                         if (currentOffset <= extraBytesNotInLoop)
                         {
+#if DEBUG
                             if (emitComp->opts.disAsm)
                             {
                                 printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < "
                                        "extraBytesNotInLoop) in (%s)\n",
                                        emitComp->info.compMethodName);
                             }
-
+#endif
                             if (nextId->idIns() == INS_align)
                             {
                                 assert(emitComp->compJitAlignLoopBoundary > 16);
@@ -12826,7 +12831,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 #endif
                                 }
                             }
-
+#if DEBUG
                             if (emitComp->opts.disAsm)
                             {
                                 printf("; ~~~~~~~~~~~~~~~~~~~~~~ alignment= %d bytes, loopsize= %d bytes, "
@@ -12834,19 +12839,21 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                                        nBytes, loopSize, minimumBlocksNeeded, extraBytesNotInLoop,
                                        emitComp->info.compMethodName);
                             }
-
+#endif
                             // In the end dst should be at alignment boundary
                             assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
                         }
                     }
                     else
                     {
+#if DEBUG
                         if (emitComp->opts.disAsm)
                         {
                             printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because threshold. loopSize= %d, "
                                    "emitComp->compJitAlignLoopMaxCodeSize= %d in (%s)\n",
                                    loopSize, emitComp->compJitAlignLoopMaxCodeSize, emitComp->info.compMethodName);
                         }
+#endif
                         // If next instruction is align, skip it so
                         // we do not check the heuristics again.
                         if (nextId->idIns() == INS_align)
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 061127112260d1..87fed3ee6c1be5 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2554,8 +2554,8 @@ void Compiler::optFindNaturalLoops()
                 first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
             }
         }
-    }
 #endif
+    }
 
     // Make sure that loops are canonical: that every loop has a unique "top", by creating an empty "nop"
     // one, if necessary, for loops containing others that share a "top."

From 256aae5fefce9dd8db29f3703ee1c26e2112cff9 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 6 Nov 2020 13:26:17 -0800
Subject: [PATCH 21/59] refactoring and cleanup

---
 src/coreclr/jit/codegenlinear.cpp |   8 ++-
 src/coreclr/jit/compiler.cpp      |  19 +++---
 src/coreclr/jit/compiler.h        |  33 ++++++----
 src/coreclr/jit/emit.cpp          |   4 +-
 src/coreclr/jit/emitxarch.cpp     | 102 ++++++++++++++----------------
 src/coreclr/jit/emitxarch.h       |   3 +
 src/coreclr/jit/jitconfigvalues.h |  18 ++++--
 src/coreclr/jit/optimizer.cpp     |   9 ++-
 8 files changed, 106 insertions(+), 90 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 9f1ef2aff34fa3..a21be172e813c3 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -772,19 +772,21 @@ void CodeGen::genCodeForBBlist()
 #if defined(TARGET_XARCH)
         if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
         {
-#ifdef DEBUG
+            assert(ShouldAlignLoops());
+#ifndef ADAPTIVE_LOOP_ALIGNMENT
             if (verbose)
             {
                 printf("Adding 'align' instruction to align loop header block " FMT_BB, block->bbNext->bbNum);
             }
-#endif
-            if ((compiler->compJitAlignLoopBoundary > 16) && (!compiler->compJitAlignLoopAdaptive))
+
+            if ((compiler->opts.compJitAlignLoopBoundary > 16) && (!compiler->opts.compJitAlignLoopAdaptive))
             {
                 //TODO: Only do this if we are confident that the loop size doesn't exceed the heuristics threshold
                 GetEmitter()->emitVariableLoopAlign();
 
             }
             else
+#endif
             {
                 GetEmitter()->emitLoopAlign();
             }
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 1b66887c495aea..9b82a549e1d9e6 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2615,13 +2615,14 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.compDbgInfo = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_INFO);
     opts.compDbgEnC  = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_EnC);
 
-    compJitAlignLoopMinBlockWeight = 10; //JitConfig.JitAlignLoopMinBlockWeight();
-    compJitAlignLoopMaxCodeSize    = 0x60; //JitConfig.JitAlignLoopMaxCodeSize();
-    compJitAlignLoopBoundary       = 32; //ReinterpretHexAsDecimal(JitConfig.JitAlignLoopBoundary());
-    compJitAlignLoopForJcc         = false; //JitConfig.JitAlignLoopForJcc() == 1;
-    // TODO: Default loop adaptive
-    compJitAlignLoopAdaptive       = true; //JitConfig.JitAlignLoopAdaptive() == 1;
-    assert(isPow2(compJitAlignLoopBoundary));
+#ifdef DEBUG
+    opts.compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
+    opts.compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
+    opts.compJitAlignLoopBoundary       = JitConfig.JitAlignLoopBoundary();
+    opts.compJitAlignLoopForJcc = JitConfig.JitAlignLoopForJcc() == 1;
+    opts.compJitAlignLoopAdaptive = JitConfig.JitAlignLoopAdaptive() == 1;
+    assert(isPow2(opts.compJitAlignLoopBoundary));
+#endif
 
 
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
@@ -3934,9 +3935,7 @@ void Compiler::compSetOptimizationLevel()
         }
         else
         {
-            codeGen->SetAlignLoops(opts.jitFlags->IsSet(JitFlags::JIT_FLAG_ALIGN_LOOPS));
-            // TODO: Default AlignLoop
-            codeGen->SetAlignLoops(true);
+            codeGen->SetAlignLoops(JitConfig.JitAlignLoops() == 1);
         }
     }
 
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 32fd8db70f87f8..1480a0025ff93d 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -2234,20 +2234,6 @@ class Compiler
 public:
     hashBvGlobalData hbvGlobalData; // Used by the hashBv bitvector package.
 
-/*
-* Loop alignment heuristics
-* These are overriden by the COMPlus_ variables, but in future, 
-*/
-
-//#define ALIGN_LOOP_MIN_BB_WEIGHT 100 // Minimum average hits a block should get in order to be considered as hot for loop alignment.
-//#define ALIGN_LOOP_MAX_CODE_SIZE 20 // Maximum code size of a loop for which loop alignment will be done.
-
-    unsigned compJitAlignLoopMinBlockWeight;
-    unsigned compJitAlignLoopMaxCodeSize;
-    unsigned compJitAlignLoopBoundary;
-    bool     compJitAlignLoopForJcc;
-    bool     compJitAlignLoopAdaptive;
-
 #ifdef DEBUG
     bool verbose;
     bool verboseTrees;
@@ -9050,6 +9036,25 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         bool dspGCtbls;       // Display the GC tables
 #endif
 
+#define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 10
+#define DEFAULT_ALIGN_LOOP_BOUNDARY 32
+
+#ifdef DEBUG
+        // Loop alignment variables
+        unsigned compJitAlignLoopMinBlockWeight; // Minimum weight needed for the first block of a loop to make it a
+                                                 // candidate for alignment.
+        unsigned compJitAlignLoopMaxCodeSize;    // For non-adaptive alignment, maximum loop size (in bytes) for which
+                                                 // alignment will be done.
+        unsigned compJitAlignLoopBoundary;       // For non-adaptive alignment, address boundary (power of 2) at which
+                                                 // loop alignment should be done. By default, 32B.
+        bool     compJitAlignLoopForJcc;         // If set, for non-adaptive alignment, ensure loop jmps are not on or
+                                                 // cross alignment boundary.
+        bool     compJitAlignLoopAdaptive;       // If set, perform adaptive loop alignment that limits the amount of padding
+                                                 // based on loop size.
+#else
+        #define ADAPTIVE_LOOP_ALIGNMENT
+#endif
+
 #ifdef LATE_DISASM
         bool doLateDisasm; // Run the late disassembler
 #endif                     // LATE_DISASM
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index a21b1a9ef67980..12f6ed8b52600e 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -5225,7 +5225,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
             
             if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr)
             {
-                size_t lastBoundaryAddr = (size_t)cp & ~((size_t)emitComp->compJitAlignLoopBoundary - 1);
+                size_t lastBoundaryAddr = (size_t)cp & ~((size_t)emitComp->opts.compJitAlignLoopBoundary - 1);
 
                 // draw boundary if lastCp was before the lastBoundary.
                 if (lastCp < lastBoundaryAddr)
@@ -5260,7 +5260,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
                     {
                         printf("...............................");
                     }
-                    printf(" %dB boundary ...............................\n", (emitComp->compJitAlignLoopBoundary));
+                    printf(" %dB boundary ...............................\n", (emitComp->opts.compJitAlignLoopBoundary));
                 }
             }
 #endif
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 9d3ccd3d206296..8d4cd5e223d540 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2669,6 +2669,7 @@ void emitter::emitLoopAlign()
     emitCurIGsize += 15;
 }
 
+#ifndef ADAPTIVE_LOOP_ALIGNMENT
 /*****************************************************************************
  *
  *  The next instruction will be a loop head entry point
@@ -2678,8 +2679,8 @@ void emitter::emitLoopAlign()
 
 void emitter::emitVariableLoopAlign()
 {
-    unsigned insAlignCount = (emitComp->compJitAlignLoopBoundary - 1) / 15;
-    unsigned lastInsAlignSize = (emitComp->compJitAlignLoopBoundary - 1) % 15;
+    unsigned insAlignCount = (emitComp->opts.compJitAlignLoopBoundary - 1) / 15;
+    unsigned lastInsAlignSize = (emitComp->opts.compJitAlignLoopBoundary - 1) % 15;
 
     while (insAlignCount)
     {
@@ -2698,6 +2699,7 @@ void emitter::emitVariableLoopAlign()
         emitCurIGsize += lastInsAlignSize;
     }
 }
+#endif
 
 /*****************************************************************************
  *
@@ -12631,7 +12633,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 // Candidate for loop alignment
                 assert(codeGen->ShouldAlignLoops());
                 assert(ig->igFlags & IGF_ALIGN_LOOP);
-                unsigned alignmentBoundary = emitComp->compJitAlignLoopBoundary;
+#ifdef DEBUG
+                unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
+#else
+                unsigned alignmentBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
+#endif
                 sz                         = SMALL_IDSC_SIZE;
 
                 // If already at alignment boundary, no need to emit anything.
@@ -12640,7 +12646,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     break;
                 }
 
-                if (emitComp->compJitAlignLoopAdaptive)
+#if DEBUG
+                bool displayAlignmentDetails = (emitComp->opts.disAsm & emitComp->opts.disAddr) || emitComp->verbose;
+#endif
+
+#ifndef ADAPTIVE_LOOP_ALIGNMENT
+                if (emitComp->opts.compJitAlignLoopAdaptive)
+#endif
                 {
                     // calculate the loop size
                     unsigned  loopSize     = 0;
@@ -12653,7 +12665,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                             break;
                         }
                     }
-
+                    //TODO: See if comparing loopSize > 128 would be sensible instead?
+                    //TODO: code cleanup
                     // Start to align on 32B boundary with a fallback to 16B boundary
                     alignmentBoundary                = 32;
                     int      minBlocksNeededForLoop  = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
@@ -12666,13 +12679,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     {
                         skipPadding = true;
 #if DEBUG
-                        if (emitComp->opts.disAsm)
+                        if (displayAlignmentDetails)
                         {
-                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because maxBlocksAllowed. loopSize= %d, "
-                                   "minBlocksNeededForLoop= %d, alignmentBoundary= %d, nPaddingBytes= %d, "
-                                   "nMaxPaddingBytes= %d in (%s)\n",
-                                   loopSize, minBlocksNeededForLoop, alignmentBoundary, nPaddingBytes, nMaxPaddingBytes,
-                                   emitComp->info.compMethodName);
+                            printf("\t\t;; Skip alignment: 'Loopsize= %d bytes.' in (%s)\n",
+                                   loopSize, emitComp->info.compFullName);
                         }
 #endif
                     }
@@ -12687,13 +12697,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                         {
                             skipPadding = true;
 #if DEBUG
-                            if (emitComp->opts.disAsm)
+                            if (displayAlignmentDetails)
                             {
-                                printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because maxPaddingAllowed. loopSize= %d, "
-                                       "minBlocksNeededForLoop= %d, alignmentBoundary= %d, nPaddingBytes= %d, "
-                                       "nMaxPaddingBytes= %d in (%s)\n",
-                                       loopSize, minBlocksNeededForLoop, alignmentBoundary, nPaddingBytes,
-                                       nMaxPaddingBytes, emitComp->info.compMethodName);
+                                printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, AlignmentBoundary= %dB.' in (%s)\n",
+                                    nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary, emitComp->info.compFullName);
                             }
 #endif
                         }
@@ -12710,10 +12717,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                         {
                             skipPadding = true;
 #if DEBUG
-                            if (emitComp->opts.disAsm)
+                            if (displayAlignmentDetails)
                             {
-                                printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < "
-                                       "extraBytesNotInLoop) in (%s)\n",
+                                printf("\t\t;; Skip alignment: 'Loop already aligned.' in (%s)\n",
                                        emitComp->info.compMethodName);
                             }
 #endif
@@ -12723,12 +12729,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                             dst = emitOutputNOP(dst, nPaddingBytes);
                             assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
 #if DEBUG
-                            if (emitComp->opts.disAsm)
+                            if (displayAlignmentDetails)
                             {
-                                printf("; ~~~~~~~~~~~~~~~~~~~~~~ alignment= %d bytes, loopsize= %d bytes, "
-                                       "minBlocksNeeded= %d, extraBytesNotInLoop= %d, alignmentBoundary= %dB in (%s)\n",
-                                       nPaddingBytes, loopSize, minBlocksNeededForLoop, extraBytesNotInLoop, alignmentBoundary,
-                                       emitComp->info.compMethodName);
+                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, AlignmentBoundary= %dB.' in (%s)\n",
+                                       nPaddingBytes, loopSize, minBlocksNeededForLoop, alignmentBoundary, emitComp->info.compFullName);
                             }
 #endif
                         }
@@ -12746,6 +12750,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                         ig->igFlags |= IGF_UPD_ISZ;
                     }
                 }
+#ifndef ADAPTIVE_LOOP_ALIGNMENT
                 else
                 {
                     instrDesc* nextId = id;
@@ -12758,7 +12763,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     {
                         if (nextId->idIns() == INS_align)
                         {
-                            assert(emitComp->compJitAlignLoopBoundary > 16);
+                            assert(alignmentBoundary > 16);
                             nextId->idCodeSize(0);
                         }
                         break;
@@ -12770,44 +12775,40 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     {
                         loopSize += igInLoop->igSize;
                         if ((igInLoop->igLoopBackEdge == loopHeaderIg) ||
-                            (loopSize > emitComp->compJitAlignLoopMaxCodeSize))
+                            (loopSize > emitComp->opts.compJitAlignLoopMaxCodeSize))
                         {
                             break;
                         }
                     }
 
                     // Only align if it matches the heuristics
-                    if (loopSize <= emitComp->compJitAlignLoopMaxCodeSize)
+                    if (loopSize <= emitComp->opts.compJitAlignLoopMaxCodeSize)
                     {
                         int    minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
                         int    extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
                         size_t currentOffset       = (size_t)dst % alignmentBoundary;
 
                         // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
-                        if (emitComp->compJitAlignLoopForJcc)
+                        if (emitComp->opts.compJitAlignLoopForJcc)
                         {
                             // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
                             currentOffset++;
                         }
 
-                        // TODO: Add some kind of maxLoopPadding to something smaller so we don't see lot of pads added?
                         // TODO: Revisit nop sequence we emit in case of 31 bytes
 
                         // Padding is needed only if loop starts at or after the current offset.
                         // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
                         if (currentOffset <= extraBytesNotInLoop)
                         {
-#if DEBUG
-                            if (emitComp->opts.disAsm)
+                            if (displayAlignmentDetails)
                             {
-                                printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because (currentOffset < "
-                                       "extraBytesNotInLoop) in (%s)\n",
+                                printf("\t\t;; Skip alignment: 'Loop already aligned.' in (%s)\n",
                                        emitComp->info.compMethodName);
                             }
-#endif
                             if (nextId->idIns() == INS_align)
                             {
-                                assert(emitComp->compJitAlignLoopBoundary > 16);
+                                assert(alignmentBoundary > 16);
                                 nextId->idCodeSize(0);
                             }
                         }
@@ -12819,50 +12820,45 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                             if (nextId->idIns() == INS_align)
                             {
                                 // If next instruction is also alignment, this better be 32B padding.
-                                assert(emitComp->compJitAlignLoopBoundary > 16);
+                                assert(alignmentBoundary > 16);
 
                                 // Align further to 32B boundary, if it is not yet.
                                 if (((size_t)dst & 0x1f) != 0)
                                 {
                                     dst = emitOutputNOP(dst, 15);
                                     dst = emitOutputNOP(dst, 1);
-#if DEBUG
                                     nBytes += 16;
-#endif
                                 }
                             }
-#if DEBUG
-                            if (emitComp->opts.disAsm)
+                            if (displayAlignmentDetails)
                             {
-                                printf("; ~~~~~~~~~~~~~~~~~~~~~~ alignment= %d bytes, loopsize= %d bytes, "
-                                       "minBlocksNeeded= %d, extraBytesNotInLoop= %d in (%s)\n",
-                                       nBytes, loopSize, minimumBlocksNeeded, extraBytesNotInLoop,
-                                       emitComp->info.compMethodName);
+                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
+                                       "AlignmentBoundary= %dB.' in (%s)\n", nBytes,
+                                       loopSize, minimumBlocksNeeded, alignmentBoundary,
+                                       emitComp->info.compFullName);
                             }
-#endif
                             // In the end dst should be at alignment boundary
                             assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
                         }
                     }
                     else
                     {
-#if DEBUG
-                        if (emitComp->opts.disAsm)
+                        if (displayAlignmentDetails)
                         {
-                            printf("; ~~~~~~~~~~~~~~~~~~~~~~ Skipping because threshold. loopSize= %d, "
-                                   "emitComp->compJitAlignLoopMaxCodeSize= %d in (%s)\n",
-                                   loopSize, emitComp->compJitAlignLoopMaxCodeSize, emitComp->info.compMethodName);
+                            printf("\t\t;; Skip alignment: 'Loopsize= %d, AllowedMaxSize= %d.' in (%s)\n",
+                                   loopSize, emitComp->opts.compJitAlignLoopMaxCodeSize,
+                                   emitComp->info.compFullName);
                         }
-#endif
                         // If next instruction is align, skip it so
                         // we do not check the heuristics again.
                         if (nextId->idIns() == INS_align)
                         {
-                            assert(emitComp->compJitAlignLoopBoundary > 16);
+                            assert(alignmentBoundary > 16);
                             nextId->idCodeSize(0);
                         }
                     }
                 }
+#endif
 
                 break;
             }
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 409cfa442a689d..566d48581be3e3 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -289,8 +289,11 @@ inline emitAttr emitDecodeScale(unsigned ensz)
 public:
 void emitLoopAlign();
 
+#ifndef ADAPTIVE_LOOP_ALIGNMENT
 void emitVariableLoopAlign();
 
+#endif
+
 void emitIns(instruction ins);
 
 void emitIns(instruction ins, emitAttr attr);
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 59a9e08d9ff0f8..547900c420884d 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -41,20 +41,24 @@ CONFIG_INTEGER(JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0) // In deb
                                                                        // optimizations are performed on the fast path.
 CONFIG_INTEGER(JitDefaultFill, W("JitDefaultFill"), 0xdd) // In debug builds, initialize the memory allocated by the nra
                                                           // with this byte.
-CONFIG_INTEGER(JitAlignLoopMinBlockWeight, W("JitAlignLoopMinBlockWeight"), 10) // Minimum weight needed of the first block of a loop to trigger its alignment.
+CONFIG_INTEGER(JitAlignLoopMinBlockWeight,
+               W("JitAlignLoopMinBlockWeight"),
+               DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT) // Minimum weight needed for the first block of a loop to make it a
+                                                    // candidate for alignment.
 CONFIG_INTEGER(JitAlignLoopMaxCodeSize,
                W("JitAlignLoopMaxCodeSize"),
-               0x60)  // Maximum code size (in bytes) of the loop after which the the code alignment for that loop will be disabled.
+               0x60)  // For non-adaptive alignment, maximum loop size (in bytes) for which alignment will be done. Defaults to 3 chunks of 32 bytes = 96 bytes.
 CONFIG_INTEGER(JitAlignLoopBoundary,
                W("JitAlignLoopBoundary"),
-               0x32) // Boundary (multiples of 2) at which inner loops should be aliged. By default, it is set to 32B.
+               DEFAULT_ALIGN_LOOP_BOUNDARY) // For non-adaptive alignment, address boundary (power of 2) at which loop
+                                            // alignment should be done. By default, 32B.
 CONFIG_INTEGER(JitAlignLoopForJcc,
                W("JitAlignLoopForJcc"),
-               0) // If set, while doing loop alignment, ensure loop jmps don't cross alignment boundary.
+               0) // If set, for non-adaptive alignment, ensure loop jmps are not on or cross alignment boundary.
 
 CONFIG_INTEGER(JitAlignLoopAdaptive,
             W("JitAlignLoopAdaptive"),
-            1) // If set, perform loop alignment adaptive to limit number of padding added.
+            1) // If set, perform adaptive loop alignment that limits the amount of padding based on loop size.
 
 CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0)
 CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1)
@@ -217,7 +221,9 @@ CONFIG_INTEGER(EnableIncompleteISAClass, W("EnableIncompleteISAClass"), 0) // En
                                                                            // intrinsic classes
 #endif                                                                     // defined(DEBUG)
 
-///
+CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 1) // If set, align inner loops
+
+                                                    ///
 /// JIT
 ///
 #ifdef FEATURE_ENABLE_NO_RANGE_CHECKS
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 87fed3ee6c1be5..0c282baf83d4b2 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2544,12 +2544,17 @@ void Compiler::optFindNaturalLoops()
         }
 
 #if defined(TARGET_XARCH)
-        //TODO: Move should align loops flag to jitconfigvalues.h
         if (codeGen->ShouldAlignLoops())
         {
             // An innerloop candidate that might need alignment
             if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) &&
-                (compJitAlignLoopMinBlockWeight <= first->getBBWeight(this)))
+                (
+#ifdef ADAPTIVE_LOOP_ALIGNMENT
+                    DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT
+#else
+                    opts.compJitAlignLoopMinBlockWeight
+#endif
+                    <= first->getBBWeight(this)))
             {
                 first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
             }

From 26c3c84c9b9277055854a7a27c868d0a0cd1dc32 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 6 Nov 2020 16:06:52 -0800
Subject: [PATCH 22/59] refactoring and build errors fix

---
 src/coreclr/jit/codegencommon.cpp |  5 ++--
 src/coreclr/jit/emitxarch.cpp     | 40 +++++++++++++++----------------
 2 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index d6eebc9d416152..26d2bcae753fb2 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -2345,9 +2345,10 @@ void CodeGen::genEmitMachineCode()
 #ifdef DEBUG
     if (compiler->opts.disAsm || verbose)
     {
-        printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d (MethodHash=%08x) for "
+        printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d, allocated bytes for code %d (MethodHash=%08x) for "
                "method %s\n",
-               codeSize, prologSize, compiler->info.compPerfScore, instrCount, compiler->info.compMethodHash(),
+               codeSize, prologSize, compiler->info.compPerfScore, instrCount,
+               GetEmitter()->emitTotalHotCodeSize, compiler->info.compMethodHash(),
                compiler->info.compFullName);
         printf("; ============================================================\n\n");
         printf(""); // in our logic this causes a flush
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 8d4cd5e223d540..3a7ed0c360eee1 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12633,10 +12633,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 // Candidate for loop alignment
                 assert(codeGen->ShouldAlignLoops());
                 assert(ig->igFlags & IGF_ALIGN_LOOP);
-#ifdef DEBUG
-                unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
-#else
+#ifdef ADAPTIVE_LOOP_ALIGNMENT
                 unsigned alignmentBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
+#else
+                unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
 #endif
                 sz                         = SMALL_IDSC_SIZE;
 
@@ -12647,50 +12647,50 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 }
 
 #if DEBUG
-                bool displayAlignmentDetails = (emitComp->opts.disAsm & emitComp->opts.disAddr) || emitComp->verbose;
+                bool displayAlignmentDetails = (emitComp->opts.disAsm && emitComp->opts.disAddr) || emitComp->verbose;
 #endif
 
 #ifndef ADAPTIVE_LOOP_ALIGNMENT
                 if (emitComp->opts.compJitAlignLoopAdaptive)
 #endif
                 {
+                    bool     skipPadding             = false;
+                    int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
+                    unsigned maxLoopSize             = DEFAULT_ALIGN_LOOP_BOUNDARY * maxBlocksAllowedForLoop; 
+
                     // calculate the loop size
                     unsigned  loopSize     = 0;
                     insGroup* loopHeaderIg = ig->igNext;
                     for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
                     {
                         loopSize += igInLoop->igSize;
-                        if ((igInLoop->igLoopBackEdge == loopHeaderIg))
+                        if (igInLoop->igLoopBackEdge == loopHeaderIg || loopSize > maxLoopSize)
                         {
                             break;
                         }
                     }
-                    //TODO: See if comparing loopSize > 128 would be sensible instead?
-                    //TODO: code cleanup
+
                     // Start to align on 32B boundary with a fallback to 16B boundary
-                    alignmentBoundary                = 32;
                     int      minBlocksNeededForLoop  = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-                    int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary);
-                    unsigned nMaxPaddingBytes        = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop)) - 1;
-                    unsigned nPaddingBytes           = (-(int)(size_t)dst) & (alignmentBoundary - 1);
-                    bool     skipPadding             = false;
+                    unsigned nMaxPaddingBytes        = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
+                    unsigned nPaddingBytes    = (-(int)(size_t)dst) & (alignmentBoundary - 1);
 
-                    if (minBlocksNeededForLoop > maxBlocksAllowedForLoop)
+                    if (loopSize > maxLoopSize)
                     {
                         skipPadding = true;
 #if DEBUG
                         if (displayAlignmentDetails)
                         {
-                            printf("\t\t;; Skip alignment: 'Loopsize= %d bytes.' in (%s)\n",
-                                   loopSize, emitComp->info.compFullName);
+                            printf("\t\t;; Skip alignment: 'Loopsize= %d, MaxLoopSize= %d.' in (%s)\n",
+                                   loopSize, maxLoopSize, emitComp->info.compFullName);
                         }
 #endif
                     }
                     else if (nPaddingBytes > nMaxPaddingBytes)
                     {
-                        // Now try to align to 16B boundary
+                        // Cannot add large padding to align to 32B, so try to align to 16B boundary.
                         alignmentBoundary = 16;
-                        nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop);
+                        nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
                         nPaddingBytes     = (-(int)(size_t)dst) & (alignmentBoundary - 1);
 
                         if (nPaddingBytes > nMaxPaddingBytes)
@@ -12708,7 +12708,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 
                     if (!skipPadding && (nPaddingBytes > 0))
                     {
-                        int    extraBytesNotInLoop = (32 * minBlocksNeededForLoop) - loopSize; // Still have it at alignmentboundary=32
+                        size_t extraBytesNotInLoop = (32 * minBlocksNeededForLoop) - loopSize; // Still have it at alignmentboundary=32
                         size_t currentOffset       = (size_t)dst % alignmentBoundary;
 
                         // Padding is needed only if loop starts at or after the current offset.
@@ -12774,8 +12774,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
                     {
                         loopSize += igInLoop->igSize;
-                        if ((igInLoop->igLoopBackEdge == loopHeaderIg) ||
-                            (loopSize > emitComp->opts.compJitAlignLoopMaxCodeSize))
+                        if (igInLoop->igLoopBackEdge == loopHeaderIg ||
+                            loopSize > emitComp->opts.compJitAlignLoopMaxCodeSize)
                         {
                             break;
                         }

From 6bb1a753657474be2938a08e6ff7fddf710a4616 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 6 Nov 2020 16:09:22 -0800
Subject: [PATCH 23/59] jit format

---
 src/coreclr/jit/codegencommon.cpp |  8 +++---
 src/coreclr/jit/codegenlinear.cpp |  3 +--
 src/coreclr/jit/compiler.cpp      |  5 ++--
 src/coreclr/jit/compiler.h        |  8 +++---
 src/coreclr/jit/emit.cpp          | 25 ++++++++++--------
 src/coreclr/jit/emit.h            |  2 +-
 src/coreclr/jit/emitxarch.cpp     | 44 +++++++++++++++++--------------
 src/coreclr/jit/jitconfigvalues.h |  9 ++++---
 src/coreclr/jit/optimizer.cpp     | 11 ++++----
 9 files changed, 60 insertions(+), 55 deletions(-)

diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index 26d2bcae753fb2..795037fc74c8e5 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -2345,11 +2345,11 @@ void CodeGen::genEmitMachineCode()
 #ifdef DEBUG
     if (compiler->opts.disAsm || verbose)
     {
-        printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d, allocated bytes for code %d (MethodHash=%08x) for "
+        printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d, allocated bytes for "
+               "code %d (MethodHash=%08x) for "
                "method %s\n",
-               codeSize, prologSize, compiler->info.compPerfScore, instrCount,
-               GetEmitter()->emitTotalHotCodeSize, compiler->info.compMethodHash(),
-               compiler->info.compFullName);
+               codeSize, prologSize, compiler->info.compPerfScore, instrCount, GetEmitter()->emitTotalHotCodeSize,
+               compiler->info.compMethodHash(), compiler->info.compFullName);
         printf("; ============================================================\n\n");
         printf(""); // in our logic this causes a flush
     }
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index a21be172e813c3..bb379294ed1dd0 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -781,9 +781,8 @@ void CodeGen::genCodeForBBlist()
 
             if ((compiler->opts.compJitAlignLoopBoundary > 16) && (!compiler->opts.compJitAlignLoopAdaptive))
             {
-                //TODO: Only do this if we are confident that the loop size doesn't exceed the heuristics threshold
+                // TODO: Only do this if we are confident that the loop size doesn't exceed the heuristics threshold
                 GetEmitter()->emitVariableLoopAlign();
-
             }
             else
 #endif
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 9b82a549e1d9e6..e997e87a43b0ea 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2619,12 +2619,11 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
     opts.compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
     opts.compJitAlignLoopBoundary       = JitConfig.JitAlignLoopBoundary();
-    opts.compJitAlignLoopForJcc = JitConfig.JitAlignLoopForJcc() == 1;
-    opts.compJitAlignLoopAdaptive = JitConfig.JitAlignLoopAdaptive() == 1;
+    opts.compJitAlignLoopForJcc         = JitConfig.JitAlignLoopForJcc() == 1;
+    opts.compJitAlignLoopAdaptive       = JitConfig.JitAlignLoopAdaptive() == 1;
     assert(isPow2(opts.compJitAlignLoopBoundary));
 #endif
 
-
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns
     opts.compDbgCode = false;
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 1480a0025ff93d..fe7dd7314a6bd6 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9047,12 +9047,12 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
                                                  // alignment will be done.
         unsigned compJitAlignLoopBoundary;       // For non-adaptive alignment, address boundary (power of 2) at which
                                                  // loop alignment should be done. By default, 32B.
-        bool     compJitAlignLoopForJcc;         // If set, for non-adaptive alignment, ensure loop jmps are not on or
+        bool compJitAlignLoopForJcc;             // If set, for non-adaptive alignment, ensure loop jmps are not on or
                                                  // cross alignment boundary.
-        bool     compJitAlignLoopAdaptive;       // If set, perform adaptive loop alignment that limits number of padding
-                                                 // based on loop size.
+        bool compJitAlignLoopAdaptive; // If set, perform adaptive loop alignment that limits number of padding
+                                       // based on loop size.
 #else
-        #define ADAPTIVE_LOOP_ALIGNMENT
+#define ADAPTIVE_LOOP_ALIGNMENT
 #endif
 
 #ifdef LATE_DISASM
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 12f6ed8b52600e..3f3084f2f1804d 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4804,7 +4804,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
     // they are larger than 16 bytes and contain a loop.
     //
     if (
-        //emitComp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER1) &&
+        // emitComp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER1) &&
         emitComp->opts.OptimizationEnabled() && !emitComp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT) &&
         (emitTotalHotCodeSize > 16) && emitComp->fgHasLoops)
     {
@@ -5217,12 +5217,12 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
         for (unsigned cnt = ig->igInsCnt; cnt; cnt--)
         {
 #ifdef DEBUG
-            size_t lastCp = (size_t) cp;
+            size_t     lastCp = (size_t)cp;
             instrDesc* lastId = id;
 #endif
             castto(id, BYTE*) += emitIssue1Instr(ig, id, &cp);
 #ifdef DEBUG
-            
+
             if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr)
             {
                 size_t lastBoundaryAddr = (size_t)cp & ~((size_t)emitComp->opts.compJitAlignLoopBoundary - 1);
@@ -5236,12 +5236,13 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
 #if defined(TARGET_XARCH)
                     // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
                     bool isJccAffectedIns = ((lastIns >= INS_i_jmp && lastIns < INS_align) || (lastIns == INS_call) ||
-                                            (lastIns == INS_ret));
+                                             (lastIns == INS_ret));
                     if (cnt)
                     {
                         instruction currIns = id->idIns();
-                        if ((lastIns == INS_cmp) || (lastIns == INS_test) || (lastIns == INS_add) || (lastIns == INS_sub) ||
-                            (lastIns == INS_and) || (lastIns == INS_inc) || (lastIns == INS_dec))
+                        if ((lastIns == INS_cmp) || (lastIns == INS_test) || (lastIns == INS_add) ||
+                            (lastIns == INS_sub) || (lastIns == INS_and) || (lastIns == INS_inc) ||
+                            (lastIns == INS_dec))
                         {
                             isJccAffectedIns |= (currIns >= INS_i_jmp && currIns < INS_align);
                         }
@@ -5254,13 +5255,15 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
                     unsigned bytesCrossedBoundary = ((size_t)cp & 0x1f);
                     if ((bytesCrossedBoundary != 0) || (isJccAffectedIns && bytesCrossedBoundary == 0))
                     {
-                        printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(lastId->idIns()), bytesCrossedBoundary);
+                        printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(lastId->idIns()),
+                               bytesCrossedBoundary);
                     }
                     else
                     {
                         printf("...............................");
                     }
-                    printf(" %dB boundary ...............................\n", (emitComp->opts.compJitAlignLoopBoundary));
+                    printf(" %dB boundary ...............................\n",
+                           (emitComp->opts.compJitAlignLoopBoundary));
                 }
             }
 #endif
@@ -7269,9 +7272,9 @@ void emitter::emitInitIG(insGroup* ig)
        sure we act the same in non-DEBUG builds.
     */
 
-    ig->igSize   = 0;
-    ig->igGCregs = RBM_NONE;
-    ig->igInsCnt = 0;
+    ig->igSize         = 0;
+    ig->igGCregs       = RBM_NONE;
+    ig->igInsCnt       = 0;
     ig->igLoopBackEdge = nullptr;
 }
 
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index f1fab43c078f98..6aaec2a17bcb55 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -309,7 +309,7 @@ struct insGroup
 #endif
 
     unsigned char igInsCnt; // # of instructions  in this group
-    // TODO: Add loopBackEdge?
+                            // TODO: Add loopBackEdge?
 
 #endif // REGMASK_BITS
 
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 3a7ed0c360eee1..97b082bd5ab00d 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2679,7 +2679,7 @@ void emitter::emitLoopAlign()
 
 void emitter::emitVariableLoopAlign()
 {
-    unsigned insAlignCount = (emitComp->opts.compJitAlignLoopBoundary - 1) / 15;
+    unsigned insAlignCount    = (emitComp->opts.compJitAlignLoopBoundary - 1) / 15;
     unsigned lastInsAlignSize = (emitComp->opts.compJitAlignLoopBoundary - 1) % 15;
 
     while (insAlignCount)
@@ -12638,7 +12638,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 #else
                 unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
 #endif
-                sz                         = SMALL_IDSC_SIZE;
+                sz = SMALL_IDSC_SIZE;
 
                 // If already at alignment boundary, no need to emit anything.
                 if (((size_t)dst & (alignmentBoundary - 1)) == 0)
@@ -12656,7 +12656,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 {
                     bool     skipPadding             = false;
                     int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
-                    unsigned maxLoopSize             = DEFAULT_ALIGN_LOOP_BOUNDARY * maxBlocksAllowedForLoop; 
+                    unsigned maxLoopSize             = DEFAULT_ALIGN_LOOP_BOUNDARY * maxBlocksAllowedForLoop;
 
                     // calculate the loop size
                     unsigned  loopSize     = 0;
@@ -12671,9 +12671,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     }
 
                     // Start to align on 32B boundary with a fallback to 16B boundary
-                    int      minBlocksNeededForLoop  = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-                    unsigned nMaxPaddingBytes        = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
-                    unsigned nPaddingBytes    = (-(int)(size_t)dst) & (alignmentBoundary - 1);
+                    int      minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+                    unsigned nMaxPaddingBytes       = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
+                    unsigned nPaddingBytes          = (-(int)(size_t)dst) & (alignmentBoundary - 1);
 
                     if (loopSize > maxLoopSize)
                     {
@@ -12681,8 +12681,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 #if DEBUG
                         if (displayAlignmentDetails)
                         {
-                            printf("\t\t;; Skip alignment: 'Loopsize= %d, MaxLoopSize= %d.' in (%s)\n",
-                                   loopSize, maxLoopSize, emitComp->info.compFullName);
+                            printf("\t\t;; Skip alignment: 'Loopsize= %d, MaxLoopSize= %d.' in (%s)\n", loopSize,
+                                   maxLoopSize, emitComp->info.compFullName);
                         }
 #endif
                     }
@@ -12699,8 +12699,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 #if DEBUG
                             if (displayAlignmentDetails)
                             {
-                                printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, AlignmentBoundary= %dB.' in (%s)\n",
-                                    nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary, emitComp->info.compFullName);
+                                printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
+                                       "AlignmentBoundary= %dB.' in (%s)\n",
+                                       nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary,
+                                       emitComp->info.compFullName);
                             }
 #endif
                         }
@@ -12708,8 +12710,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 
                     if (!skipPadding && (nPaddingBytes > 0))
                     {
-                        size_t extraBytesNotInLoop = (32 * minBlocksNeededForLoop) - loopSize; // Still have it at alignmentboundary=32
-                        size_t currentOffset       = (size_t)dst % alignmentBoundary;
+                        size_t extraBytesNotInLoop =
+                            (32 * minBlocksNeededForLoop) - loopSize; // Still have it at alignmentboundary=32
+                        size_t currentOffset = (size_t)dst % alignmentBoundary;
 
                         // Padding is needed only if loop starts at or after the current offset.
                         // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
@@ -12731,8 +12734,10 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 #if DEBUG
                             if (displayAlignmentDetails)
                             {
-                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, AlignmentBoundary= %dB.' in (%s)\n",
-                                       nPaddingBytes, loopSize, minBlocksNeededForLoop, alignmentBoundary, emitComp->info.compFullName);
+                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
+                                       "AlignmentBoundary= %dB.' in (%s)\n",
+                                       nPaddingBytes, loopSize, minBlocksNeededForLoop, alignmentBoundary,
+                                       emitComp->info.compFullName);
                             }
 #endif
                         }
@@ -12744,7 +12749,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                         id->idCodeSize(0);
                         ig->igFlags |= IGF_UPD_ISZ;
                     }
-                    else if(nPaddingBytes != id->idCodeSize())
+                    else if (nPaddingBytes != id->idCodeSize())
                     {
                         id->idCodeSize(nPaddingBytes);
                         ig->igFlags |= IGF_UPD_ISZ;
@@ -12833,8 +12838,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                             if (displayAlignmentDetails)
                             {
                                 printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
-                                       "AlignmentBoundary= %dB.' in (%s)\n", nBytes,
-                                       loopSize, minimumBlocksNeeded, alignmentBoundary,
+                                       "AlignmentBoundary= %dB.' in (%s)\n",
+                                       nBytes, loopSize, minimumBlocksNeeded, alignmentBoundary,
                                        emitComp->info.compFullName);
                             }
                             // In the end dst should be at alignment boundary
@@ -12845,9 +12850,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     {
                         if (displayAlignmentDetails)
                         {
-                            printf("\t\t;; Skip alignment: 'Loopsize= %d, AllowedMaxSize= %d.' in (%s)\n",
-                                   loopSize, emitComp->opts.compJitAlignLoopMaxCodeSize,
-                                   emitComp->info.compFullName);
+                            printf("\t\t;; Skip alignment: 'Loopsize= %d, AllowedMaxSize= %d.' in (%s)\n", loopSize,
+                                   emitComp->opts.compJitAlignLoopMaxCodeSize, emitComp->info.compFullName);
                         }
                         // If next instruction is align, skip it so
                         // we do not check the heuristics again.
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 547900c420884d..3d65ca1a7d32a3 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -47,7 +47,8 @@ CONFIG_INTEGER(JitAlignLoopMinBlockWeight,
                                                     // candidate for alignment.
 CONFIG_INTEGER(JitAlignLoopMaxCodeSize,
                W("JitAlignLoopMaxCodeSize"),
-               0x60)  // For non-adaptive alignment, minimum loop size (in bytes) for which alignment will be done. Defaults to 3 blocks of 32 bytes chunks = 96 bytes.
+               0x60) // For non-adaptive alignment, minimum loop size (in bytes) for which alignment will be done.
+                     // Defaults to 3 blocks of 32 bytes chunks = 96 bytes.
 CONFIG_INTEGER(JitAlignLoopBoundary,
                W("JitAlignLoopBoundary"),
                DEFAULT_ALIGN_LOOP_BOUNDARY) // For non-adaptive alignment, address boundary (power of 2) at which loop
@@ -57,8 +58,8 @@ CONFIG_INTEGER(JitAlignLoopForJcc,
                0) // If set, for non-adaptive alignment, ensure loop jmps are not on or cross alignment boundary.
 
 CONFIG_INTEGER(JitAlignLoopAdaptive,
-            W("JitAlignLoopAdaptive"),
-            1) // If set, perform adaptive loop alignment that limits number of padding based on loop size.
+               W("JitAlignLoopAdaptive"),
+               1) // If set, perform adaptive loop alignment that limits number of padding based on loop size.
 
 CONFIG_INTEGER(JitDirectAlloc, W("JitDirectAlloc"), 0)
 CONFIG_INTEGER(JitDoubleAlign, W("JitDoubleAlign"), 1)
@@ -223,7 +224,7 @@ CONFIG_INTEGER(EnableIncompleteISAClass, W("EnableIncompleteISAClass"), 0) // En
 
 CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 1) // If set, align inner loops
 
-                                                    ///
+///
 /// JIT
 ///
 #ifdef FEATURE_ENABLE_NO_RANGE_CHECKS
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 0c282baf83d4b2..56a91bbe33f3ef 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2547,14 +2547,13 @@ void Compiler::optFindNaturalLoops()
         if (codeGen->ShouldAlignLoops())
         {
             // An innerloop candidate that might need alignment
-            if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) &&
-                (
+            if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) && (
 #ifdef ADAPTIVE_LOOP_ALIGNMENT
-                    DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT
+                                                                                  DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT
 #else
-                    opts.compJitAlignLoopMinBlockWeight
+                                                                                  opts.compJitAlignLoopMinBlockWeight
 #endif
-                    <= first->getBBWeight(this)))
+                                                                                  <= first->getBBWeight(this)))
             {
                 first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
             }
@@ -2916,7 +2915,7 @@ bool Compiler::optCanonicalizeLoop(unsigned char loopInd)
     {
         optLoopTable[loopInd].lpEntry = newT;
     }
-    //assert((optLoopTable[loopInd].lpFirst->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP) == 0);
+    // assert((optLoopTable[loopInd].lpFirst->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP) == 0);
     optLoopTable[loopInd].lpTop   = newT;
     optLoopTable[loopInd].lpFirst = newT;
     // Something to investigate

From 03f6ba61b3417587a1e38070b679eec1252f0ccc Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 6 Nov 2020 16:24:26 -0800
Subject: [PATCH 24/59] one more build error

---
 src/coreclr/jit/emitxarch.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 97b082bd5ab00d..fa001ffe07db7a 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12790,7 +12790,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     if (loopSize <= emitComp->opts.compJitAlignLoopMaxCodeSize)
                     {
                         int    minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-                        int    extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
+                        size_t extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
                         size_t currentOffset       = (size_t)dst % alignmentBoundary;
 
                         // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary

From 8036061a2d5d5a7218c27b1d9624f755d4bd2e81 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 11 Nov 2020 08:33:59 -0800
Subject: [PATCH 25/59] Add emitLoopAlignAdjustments()

---
 src/coreclr/jit/codegencommon.cpp |   6 +-
 src/coreclr/jit/codegenlinear.cpp |   1 +
 src/coreclr/jit/emit.cpp          | 135 ++++++++++++++++++++++++++++++
 src/coreclr/jit/emit.h            |   1 +
 src/coreclr/jit/emitxarch.cpp     |  37 ++++++--
 src/coreclr/jit/optimizer.cpp     |  11 +--
 6 files changed, 176 insertions(+), 15 deletions(-)

diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index 795037fc74c8e5..2dcbced91d7518 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -2258,6 +2258,9 @@ void CodeGen::genGenerateMachineCode()
 
     GetEmitter()->emitJumpDistBind();
 
+    /* Perform alignment adjustments */
+    GetEmitter()->emitLoopAlignAdjustments();
+
     /* The code is now complete and final; it should not change after this. */
 }
 
@@ -2348,7 +2351,8 @@ void CodeGen::genEmitMachineCode()
         printf("\n; Total bytes of code %d, prolog size %d, PerfScore %.2f, instruction count %d, allocated bytes for "
                "code %d (MethodHash=%08x) for "
                "method %s\n",
-               codeSize, prologSize, compiler->info.compPerfScore, instrCount, GetEmitter()->emitTotalHotCodeSize,
+               codeSize, prologSize, compiler->info.compPerfScore, instrCount,
+               GetEmitter()->emitTotalHotCodeSize + GetEmitter()->emitTotalColdCodeSize,
                compiler->info.compMethodHash(), compiler->info.compFullName);
         printf("; ============================================================\n\n");
         printf(""); // in our logic this causes a flush
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index bb379294ed1dd0..5943a7649ef90a 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -773,6 +773,7 @@ void CodeGen::genCodeForBBlist()
         if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
         {
             assert(ShouldAlignLoops());
+
 #ifndef ADAPTIVE_LOOP_ALIGNMENT
             if (verbose)
             {
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 3f3084f2f1804d..6920d3d92d22ef 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3665,6 +3665,8 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
         }
 #endif // DEBUG_EMIT
 
+        //assert(id->idCodeSize() == csz);
+
         /* The instruction size estimate wasn't accurate; remember this */
 
         ig->igFlags |= IGF_UPD_ISZ;
@@ -4487,6 +4489,139 @@ void emitter::emitJumpDistBind()
 #endif // DEBUG
 }
 
+void emitter::emitLoopAlignAdjustments()
+{
+#ifdef TARGET_XARCH
+
+#ifdef ADAPTIVE_LOOP_ALIGNMENT
+    unsigned alignmentBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
+#else
+    unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
+#endif
+    unsigned removeAlignment         = 0;
+    bool     skipPadding             = false;
+    int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
+    unsigned maxLoopSize             = DEFAULT_ALIGN_LOOP_BOUNDARY * maxBlocksAllowedForLoop;
+    size_t   dst                     = 0, of = 0;
+    unsigned minBlocksNeededForLoop = 0, nMaxPaddingBytes = 0, nPaddingBytes = 0, loopSize = 0;
+    for (insGroup* ig = emitIGlist; ig != nullptr; ig = ig->igNext)
+    {
+        ig->igOffs -= removeAlignment;
+        dst += ig->igSize;
+
+        // Below is not needed because we just care about the igSize
+        // and that gets adjusted in emitJumpDst when we add IGF_UPD_ISZ
+        //// recalculate the size
+        //if ((ig->igFlags & IGF_UPD_ISZ) != 0)
+        //{
+        //    igSize = emitFindOffset(ig, ig->igInsCnt);
+        //    assert(igSize == ig->igSize);
+        //}
+        //else
+        //{
+        //    igSize = ig->igSize;
+        //}
+
+        //if (emitComp->compMethodID == 37683)
+        //{
+        //    unsigned       insNum = ig->igInsCnt;
+        //    instrDesc*     id     = (instrDesc*)ig->igData;
+
+        //    /* Walk the instruction list until all are counted */
+
+        //    while (insNum > 0)
+        //    {
+        //        unsigned currSize = id->idCodeSize();
+
+
+        //        printf("[%04XH] size = %d -- ", of, id->idCodeSize());
+        //        of += currSize;
+
+        //        emitDispIns(id, true, false, false);
+
+        //        castto(id, BYTE*) += emitSizeOfInsDsc(id);
+
+        //        insNum--;
+        //    }
+        //}
+
+        //dst += igSize;
+
+        if (!(ig->igFlags & IGF_ALIGN_LOOP))
+        {
+            continue;
+        }
+
+        // TODO: Add logging for Skip/Add?
+        // TODO: I am about to align so (dst -= 15)
+
+        if ((dst & (alignmentBoundary - 1)) == 0)
+        {
+            skipPadding = true;
+        }
+        else
+        {
+            loopSize               = 0;
+            insGroup* loopHeaderIg = ig->igNext;
+            for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
+            {
+                loopSize += igInLoop->igSize;
+                if (igInLoop->igLoopBackEdge == loopHeaderIg)
+                {
+                    break;
+                }
+            }
+
+            minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+            nMaxPaddingBytes       = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
+            nPaddingBytes          = (-(int)dst) & (alignmentBoundary - 1);
+
+            if (loopSize > maxLoopSize)
+            {
+                skipPadding = true;
+            }
+            else if (nPaddingBytes > nMaxPaddingBytes)
+            {
+                alignmentBoundary = 16;
+                nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
+                nPaddingBytes     = (-(int)dst) & (alignmentBoundary - 1);
+
+                if (nPaddingBytes > nMaxPaddingBytes)
+                {
+                    //skipPadding = true;
+                }
+            }
+        }
+
+        if (!skipPadding && (nPaddingBytes > 0))
+        {
+            size_t extraBytesNotInLoop =
+                (32 * minBlocksNeededForLoop) - loopSize;   // Still have it at alignmentboundary=32
+            size_t currentOffset = dst % alignmentBoundary; // TODO: Change to & (boundary - 1)
+            if (currentOffset <= extraBytesNotInLoop)
+            {
+                //skipPadding = true;
+
+                // TODO: Detect actual no. of padding bytes.
+                // TODO: Figure out how to update size of align instructions so they just add padding in emitter.
+            }
+        }
+
+        if (skipPadding)
+        {
+            dst -= 15;
+            ig->igSize -= 15;
+            ig->igFlags |= IGF_UPD_ISZ;
+            removeAlignment += 15;
+            emitTotalCodeSize -= 15;
+
+            // remove the flag
+            ig->igFlags &= ~IGF_ALIGN_LOOP;
+        }
+    }
+#endif
+}
+
 void emitter::emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG)
 {
 #ifdef DEBUG
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 6aaec2a17bcb55..a609e2e2e02206 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -1740,6 +1740,7 @@ class emitter
     instrDescJmp* emitJumpList;       // list of local jumps in method
     instrDescJmp* emitJumpLast;       // last of local jumps in method
     void          emitJumpDistBind(); // Bind all the local jumps in method
+    void          emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
 
     void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets
 
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index fa001ffe07db7a..46204082801ccb 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12632,7 +12632,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             {
                 // Candidate for loop alignment
                 assert(codeGen->ShouldAlignLoops());
-                assert(ig->igFlags & IGF_ALIGN_LOOP);
+                
 #ifdef ADAPTIVE_LOOP_ALIGNMENT
                 unsigned alignmentBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
 #else
@@ -12640,16 +12640,34 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 #endif
                 sz = SMALL_IDSC_SIZE;
 
+#if DEBUG
+                bool displayAlignmentDetails =
+                    (emitComp->opts.disAsm /*&& emitComp->opts.disAddr*/) || emitComp->verbose;
+#endif
+               if ((ig->igFlags & IGF_ALIGN_LOOP) == 0)
+                {
+                    id->idCodeSize(0);
+                    ig->igFlags |= IGF_UPD_ISZ;
+                    if (displayAlignmentDetails)
+                    {
+                        printf("\t\t;; Skip alignment: 'Big loop.' in (%s)\n", emitComp->info.compFullName);
+                    }
+                    break;
+                }
+
                 // If already at alignment boundary, no need to emit anything.
                 if (((size_t)dst & (alignmentBoundary - 1)) == 0)
                 {
+                    id->idCodeSize(0);
+                    ig->igFlags |= IGF_UPD_ISZ;
+                    if (displayAlignmentDetails)
+                    {
+                        printf("\t\t;; Skip alignment: 'Loop already aligned at boundary.' in (%s)\n",
+                               emitComp->info.compMethodName);
+                    }
                     break;
                 }
 
-#if DEBUG
-                bool displayAlignmentDetails = (emitComp->opts.disAsm && emitComp->opts.disAddr) || emitComp->verbose;
-#endif
-
 #ifndef ADAPTIVE_LOOP_ALIGNMENT
                 if (emitComp->opts.compJitAlignLoopAdaptive)
 #endif
@@ -12678,12 +12696,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     if (loopSize > maxLoopSize)
                     {
                         skipPadding = true;
+                        assert(!"This should have checked before!");
 #if DEBUG
-                        if (displayAlignmentDetails)
+                        /*if (displayAlignmentDetails)
                         {
-                            printf("\t\t;; Skip alignment: 'Loopsize= %d, MaxLoopSize= %d.' in (%s)\n", loopSize,
-                                   maxLoopSize, emitComp->info.compFullName);
-                        }
+                            printf("\t\t;; Skip alignment: 'Loopsize= %d, MaxLoopSize= %d, Estimated= %d.' in (%s)\n", loopSize,
+                                   maxLoopSize, ig->loopSize, emitComp->info.compFullName);
+                        }*/
 #endif
                     }
                     else if (nPaddingBytes > nMaxPaddingBytes)
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 56a91bbe33f3ef..703ca580225117 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2546,14 +2546,15 @@ void Compiler::optFindNaturalLoops()
 #if defined(TARGET_XARCH)
         if (codeGen->ShouldAlignLoops())
         {
-            // An innerloop candidate that might need alignment
-            if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) && (
 #ifdef ADAPTIVE_LOOP_ALIGNMENT
-                                                                                  DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT
+            unsigned minBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
 #else
-                                                                                  opts.compJitAlignLoopMinBlockWeight
+            unsigned minBlockWeight = opts.compJitAlignLoopMinBlockWeight;
 #endif
-                                                                                  <= first->getBBWeight(this)))
+
+            // An innerloop candidate that might need alignment
+            if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) &&
+                minBlockWeight <= first->getBBWeight(this))
             {
                 first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
             }

From 809fc85a53475097a7c310b150857eba4d1aef1e Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 11 Nov 2020 08:55:42 -0800
Subject: [PATCH 26/59] Update emitLoopAlignAdjustments to just include
 loopSize calc

---
 src/coreclr/jit/emit.cpp | 97 +++++++++++++++++++++++-----------------
 1 file changed, 55 insertions(+), 42 deletions(-)

diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 6920d3d92d22ef..58031d6ec66e71 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4493,21 +4493,34 @@ void emitter::emitLoopAlignAdjustments()
 {
 #ifdef TARGET_XARCH
 
-#ifdef ADAPTIVE_LOOP_ALIGNMENT
-    unsigned alignmentBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
-#else
-    unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
-#endif
-    unsigned removeAlignment         = 0;
-    bool     skipPadding             = false;
-    int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
-    unsigned maxLoopSize             = DEFAULT_ALIGN_LOOP_BOUNDARY * maxBlocksAllowedForLoop;
-    size_t   dst                     = 0, of = 0;
-    unsigned minBlocksNeededForLoop = 0, nMaxPaddingBytes = 0, nPaddingBytes = 0, loopSize = 0;
+    unsigned maxLoopSize = 0, alignmentBoundary = 0;
+#ifndef ADAPTIVE_LOOP_ALIGNMENT
+    if (emitComp->opts.compJitAlignLoopAdaptive)
+#endif
+    {
+        // For adaptive, adjust the loop size depending on the alignment boundary
+        int maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
+        alignmentBoundary           = DEFAULT_ALIGN_LOOP_BOUNDARY;
+        maxLoopSize                 = alignmentBoundary * maxBlocksAllowedForLoop;
+    }
+#ifndef ADAPTIVE_LOOP_ALIGNMENT
+    else
+    {
+        // For non-adaptive, just take whatever is supplied using COMPlus_ variables
+        alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
+        maxLoopSize       = emitComp->opts.compJitAlignLoopMaxCodeSize;
+    }
+#endif
+
+    unsigned alignBytesRemoved = 0, loopSize;
+    bool     skipPadding       = false;
+    
+    /*size_t   dst                     = 0, of = 0;
+    unsigned minBlocksNeededForLoop = 0, nMaxPaddingBytes = 0, nPaddingBytes = 0, loopSize = 0;*/
     for (insGroup* ig = emitIGlist; ig != nullptr; ig = ig->igNext)
     {
-        ig->igOffs -= removeAlignment;
-        dst += ig->igSize;
+        ig->igOffs -= alignBytesRemoved;
+        //dst += ig->igSize;
 
         // Below is not needed because we just care about the igSize
         // and that gets adjusted in emitJumpDst when we add IGF_UPD_ISZ
@@ -4555,64 +4568,64 @@ void emitter::emitLoopAlignAdjustments()
         // TODO: Add logging for Skip/Add?
         // TODO: I am about to align so (dst -= 15)
 
-        if ((dst & (alignmentBoundary - 1)) == 0)
+       /* if ((dst & (alignmentBoundary - 1)) == 0)
         {
             skipPadding = true;
         }
-        else
+        else*/
         {
             loopSize               = 0;
             insGroup* loopHeaderIg = ig->igNext;
             for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
             {
                 loopSize += igInLoop->igSize;
-                if (igInLoop->igLoopBackEdge == loopHeaderIg)
+                if (igInLoop->igLoopBackEdge == loopHeaderIg || loopSize > maxLoopSize)
                 {
                     break;
                 }
             }
 
-            minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+           /* minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
             nMaxPaddingBytes       = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
-            nPaddingBytes          = (-(int)dst) & (alignmentBoundary - 1);
+            nPaddingBytes          = (-(int)dst) & (alignmentBoundary - 1);*/
 
             if (loopSize > maxLoopSize)
             {
                 skipPadding = true;
             }
-            else if (nPaddingBytes > nMaxPaddingBytes)
-            {
-                alignmentBoundary = 16;
-                nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
-                nPaddingBytes     = (-(int)dst) & (alignmentBoundary - 1);
+            //else if (nPaddingBytes > nMaxPaddingBytes)
+            //{
+            //    alignmentBoundary = 16;
+            //    nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
+            //    nPaddingBytes     = (-(int)dst) & (alignmentBoundary - 1);
 
-                if (nPaddingBytes > nMaxPaddingBytes)
-                {
-                    //skipPadding = true;
-                }
-            }
+            //    if (nPaddingBytes > nMaxPaddingBytes)
+            //    {
+            //        //skipPadding = true;
+            //    }
+            //}
         }
 
-        if (!skipPadding && (nPaddingBytes > 0))
-        {
-            size_t extraBytesNotInLoop =
-                (32 * minBlocksNeededForLoop) - loopSize;   // Still have it at alignmentboundary=32
-            size_t currentOffset = dst % alignmentBoundary; // TODO: Change to & (boundary - 1)
-            if (currentOffset <= extraBytesNotInLoop)
-            {
-                //skipPadding = true;
+        //if (!skipPadding && (nPaddingBytes > 0))
+        //{
+        //    size_t extraBytesNotInLoop =
+        //        (32 * minBlocksNeededForLoop) - loopSize;   // Still have it at alignmentboundary=32
+        //    size_t currentOffset = dst % alignmentBoundary; // TODO: Change to & (boundary - 1)
+        //    if (currentOffset <= extraBytesNotInLoop)
+        //    {
+        //        //skipPadding = true;
 
-                // TODO: Detect actual no. of padding bytes.
-                // TODO: Figure out how to update size of align instructions so they just add padding in emitter.
-            }
-        }
+        //        // TODO: Detect actual no. of padding bytes.
+        //        // TODO: Figure out how to update size of align instructions so they just add padding in emitter.
+        //    }
+        //}
 
         if (skipPadding)
         {
-            dst -= 15;
+            //dst -= 15;
             ig->igSize -= 15;
             ig->igFlags |= IGF_UPD_ISZ;
-            removeAlignment += 15;
+            alignBytesRemoved += 15;
             emitTotalCodeSize -= 15;
 
             // remove the flag

From 5b9b7a07d442d4ae2a9a26b3950b83ac56c5345d Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 11 Nov 2020 13:08:54 -0800
Subject: [PATCH 27/59] Remove #ifdef ADAPTIVE_LOOP_ALIGNMENT

---
 src/coreclr/jit/codegenlinear.cpp |  15 +-
 src/coreclr/jit/compiler.cpp      |  12 +-
 src/coreclr/jit/compiler.h        |  20 +-
 src/coreclr/jit/emit.cpp          | 151 +++-----
 src/coreclr/jit/emit.h            |   4 +-
 src/coreclr/jit/emitxarch.cpp     | 583 +++++++++++++++++++++---------
 src/coreclr/jit/emitxarch.h       |   5 +-
 src/coreclr/jit/jitconfigvalues.h |   3 +-
 src/coreclr/jit/optimizer.cpp     |   8 +-
 9 files changed, 478 insertions(+), 323 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 5943a7649ef90a..f721d0e7d513eb 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -774,19 +774,12 @@ void CodeGen::genCodeForBBlist()
         {
             assert(ShouldAlignLoops());
 
-#ifndef ADAPTIVE_LOOP_ALIGNMENT
-            if (verbose)
-            {
-                printf("Adding 'align' instruction to align loop header block " FMT_BB, block->bbNext->bbNum);
-            }
-
             if ((compiler->opts.compJitAlignLoopBoundary > 16) && (!compiler->opts.compJitAlignLoopAdaptive))
             {
                 // TODO: Only do this if we are confident that the loop size doesn't exceed the heuristics threshold
-                GetEmitter()->emitVariableLoopAlign();
+                GetEmitter()->emitVariableLoopAlign(compiler->opts.compJitAlignLoopBoundary);
             }
             else
-#endif
             {
                 GetEmitter()->emitLoopAlign();
             }
@@ -794,6 +787,12 @@ void CodeGen::genCodeForBBlist()
             // Mark this IG as need alignment so during emitter we can check the instruction count heuristics of
             // all IGs that follows this IG and participate in a loop.
             GetEmitter()->emitCurIG->igFlags |= IGF_ALIGN_LOOP;
+
+            if (verbose) // dump-only trace; the format string must consume exactly the three arguments passed
+            {
+                printf("Adding 'align' instruction of %u bytes in G_M%03u_IG%02u to align loop header block.\n",
+                       compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum);
+            }
         }
 #endif
 
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index e997e87a43b0ea..53046d392791fa 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2616,12 +2616,18 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.compDbgEnC  = jitFlags->IsSet(JitFlags::JIT_FLAG_DEBUG_EnC);
 
 #ifdef DEBUG
+    opts.compJitAlignLoopAdaptive       = JitConfig.JitAlignLoopAdaptive() == 1;
+    opts.compJitAlignLoopBoundary       = ReinterpretHexAsDecimal(JitConfig.JitAlignLoopBoundary());
     opts.compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
-    opts.compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
-    opts.compJitAlignLoopBoundary       = JitConfig.JitAlignLoopBoundary();
+
     opts.compJitAlignLoopForJcc         = JitConfig.JitAlignLoopForJcc() == 1;
-    opts.compJitAlignLoopAdaptive       = JitConfig.JitAlignLoopAdaptive() == 1;
+    opts.compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
+
     assert(isPow2(opts.compJitAlignLoopBoundary));
+#else
+    opts.compJitAlignLoopAdaptive = true;
+    opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
+    opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
 #endif
 
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index fe7dd7314a6bd6..261057485b87e6 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9038,22 +9038,24 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 
 #define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 10
 #define DEFAULT_ALIGN_LOOP_BOUNDARY 32
+#define DEFAULT_MAX_LOOPSIZE_FOR_ALIGN (DEFAULT_ALIGN_LOOP_BOUNDARY * 3) // parenthesized for safe expansion
 
 #ifdef DEBUG
         // Loop alignment variables
-        unsigned compJitAlignLoopMinBlockWeight; // Minimum weight needed for the first block of a loop to make it a
-                                                 // candidate for alignment.
-        unsigned compJitAlignLoopMaxCodeSize;    // For non-adaptive alignment, minimum loop size (in bytes) for which
-                                                 // alignment will be done.
-        unsigned compJitAlignLoopBoundary;       // For non-adaptive alignment, address boundary (power of 2) at which
-                                                 // loop alignment should be done. By default, 32B.
         bool compJitAlignLoopForJcc;             // If set, for non-adaptive alignment, ensure loop jmps are not on or
                                                  // cross alignment boundary.
+#endif
+        unsigned compJitAlignLoopMaxCodeSize; // For non-adaptive alignment, maximum loop size (in bytes) for which
+                                              // alignment will be done.
+
+        unsigned compJitAlignLoopMinBlockWeight; // Minimum weight needed for the first block of a loop to make it a
+                                                 // candidate for alignment.
+
+        unsigned compJitAlignLoopBoundary; // For non-adaptive alignment, address boundary (power of 2) at which
+                                           // loop alignment should be done. By default, 32B.
+
         bool compJitAlignLoopAdaptive; // If set, perform adaptive loop alignment that limits number of padding
                                        // based on loop size.
-#else
-#define ADAPTIVE_LOOP_ALIGNMENT
-#endif
 
 #ifdef LATE_DISASM
         bool doLateDisasm; // Run the late disassembler
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 58031d6ec66e71..3951453a9c818f 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4489,147 +4489,78 @@ void emitter::emitJumpDistBind()
 #endif // DEBUG
 }
 
+
+/*****************************************************************************
+ *  Sum igSize from loopHeaderIg until an IG's igLoopBackEdge is loopHeaderIg or the sum exceeds maxLoopSize.
+ */
+
+unsigned emitter::getLoopSize(insGroup* loopHeaderIg, unsigned maxLoopSize)
+{
+    unsigned  loopSize     = 0;
+
+    for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
+    {
+        loopSize += igInLoop->igSize;
+        if (igInLoop->igLoopBackEdge == loopHeaderIg || loopSize > maxLoopSize)
+        {
+            break; // either the IG that loops back to the header was reached, or the size limit was passed
+        }
+    }
+
+    return loopSize; // may be > maxLoopSize (early exit), or the total to the list end if no back edge matched
+}
+
+/*****************************************************************************
+ *  For IGs that add padding to align loops, calculate the loop size and, if it exceeds the
+    threshold, mark that alignment is not needed and adjust the igOffs, igSize
+    and emitTotalCodeSize accordingly.
+*/
+
 void emitter::emitLoopAlignAdjustments()
 {
 #ifdef TARGET_XARCH
 
-    unsigned maxLoopSize = 0, alignmentBoundary = 0;
-#ifndef ADAPTIVE_LOOP_ALIGNMENT
+    unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
+    unsigned maxLoopSize = 0;
     if (emitComp->opts.compJitAlignLoopAdaptive)
-#endif
     {
         // For adaptive, adjust the loop size depending on the alignment boundary
         int maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
-        alignmentBoundary           = DEFAULT_ALIGN_LOOP_BOUNDARY;
         maxLoopSize                 = alignmentBoundary * maxBlocksAllowedForLoop;
     }
-#ifndef ADAPTIVE_LOOP_ALIGNMENT
     else
     {
         // For non-adaptive, just take whatever is supplied using COMPlus_ variables
-        alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
         maxLoopSize       = emitComp->opts.compJitAlignLoopMaxCodeSize;
     }
-#endif
 
-    unsigned alignBytesRemoved = 0, loopSize;
-    bool     skipPadding       = false;
-    
-    /*size_t   dst                     = 0, of = 0;
-    unsigned minBlocksNeededForLoop = 0, nMaxPaddingBytes = 0, nPaddingBytes = 0, loopSize = 0;*/
+    unsigned alignBytesRemoved = 0;
+    unsigned loopSize = 0;
     for (insGroup* ig = emitIGlist; ig != nullptr; ig = ig->igNext)
     {
         ig->igOffs -= alignBytesRemoved;
-        //dst += ig->igSize;
-
-        // Below is not needed because we just care about the igSize
-        // and that gets adjusted in emitJumpDst when we add IGF_UPD_ISZ
-        //// recalculate the size
-        //if ((ig->igFlags & IGF_UPD_ISZ) != 0)
-        //{
-        //    igSize = emitFindOffset(ig, ig->igInsCnt);
-        //    assert(igSize == ig->igSize);
-        //}
-        //else
-        //{
-        //    igSize = ig->igSize;
-        //}
-
-        //if (emitComp->compMethodID == 37683)
-        //{
-        //    unsigned       insNum = ig->igInsCnt;
-        //    instrDesc*     id     = (instrDesc*)ig->igData;
-
-        //    /* Walk the instruction list until all are counted */
-
-        //    while (insNum > 0)
-        //    {
-        //        unsigned currSize = id->idCodeSize();
-
-
-        //        printf("[%04XH] size = %d -- ", of, id->idCodeSize());
-        //        of += currSize;
-
-        //        emitDispIns(id, true, false, false);
-
-        //        castto(id, BYTE*) += emitSizeOfInsDsc(id);
-
-        //        insNum--;
-        //    }
-        //}
-
-        //dst += igSize;
 
         if (!(ig->igFlags & IGF_ALIGN_LOOP))
         {
             continue;
         }
 
-        // TODO: Add logging for Skip/Add?
-        // TODO: I am about to align so (dst -= 15)
-
-       /* if ((dst & (alignmentBoundary - 1)) == 0)
+        if (getLoopSize(ig->igNext, maxLoopSize) > maxLoopSize)
         {
-            skipPadding = true;
-        }
-        else*/
-        {
-            loopSize               = 0;
-            insGroup* loopHeaderIg = ig->igNext;
-            for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
-            {
-                loopSize += igInLoop->igSize;
-                if (igInLoop->igLoopBackEdge == loopHeaderIg || loopSize > maxLoopSize)
-                {
-                    break;
-                }
-            }
-
-           /* minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-            nMaxPaddingBytes       = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
-            nPaddingBytes          = (-(int)dst) & (alignmentBoundary - 1);*/
-
-            if (loopSize > maxLoopSize)
-            {
-                skipPadding = true;
-            }
-            //else if (nPaddingBytes > nMaxPaddingBytes)
-            //{
-            //    alignmentBoundary = 16;
-            //    nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
-            //    nPaddingBytes     = (-(int)dst) & (alignmentBoundary - 1);
-
-            //    if (nPaddingBytes > nMaxPaddingBytes)
-            //    {
-            //        //skipPadding = true;
-            //    }
-            //}
-        }
-
-        //if (!skipPadding && (nPaddingBytes > 0))
-        //{
-        //    size_t extraBytesNotInLoop =
-        //        (32 * minBlocksNeededForLoop) - loopSize;   // Still have it at alignmentboundary=32
-        //    size_t currentOffset = dst % alignmentBoundary; // TODO: Change to & (boundary - 1)
-        //    if (currentOffset <= extraBytesNotInLoop)
-        //    {
-        //        //skipPadding = true;
-
-        //        // TODO: Detect actual no. of padding bytes.
-        //        // TODO: Figure out how to update size of align instructions so they just add padding in emitter.
-        //    }
-        //}
-
-        if (skipPadding)
-        {
-            //dst -= 15;
             ig->igSize -= 15;
-            ig->igFlags |= IGF_UPD_ISZ;
             alignBytesRemoved += 15;
             emitTotalCodeSize -= 15;
 
-            // remove the flag
+            // Update the flags
+            ig->igFlags |= IGF_UPD_ISZ;
             ig->igFlags &= ~IGF_ALIGN_LOOP;
+
+#if DEBUG
+            if (emitComp->verbose)
+            {
+                printf("Removed loop alignment from G_M%03u_IG%02u: 'MaxLoopSize= %d\n", emitComp->compMethodID, ig->igNum, maxLoopSize);
+            }
+#endif
         }
     }
 #endif
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index a609e2e2e02206..8ea65626dd119d 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -1740,7 +1740,9 @@ class emitter
     instrDescJmp* emitJumpList;       // list of local jumps in method
     instrDescJmp* emitJumpLast;       // last of local jumps in method
     void          emitJumpDistBind(); // Bind all the local jumps in method
-    void          emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
+
+    unsigned getLoopSize(insGroup* loopHeaderIg, unsigned maxLoopSize); // Get the smallest loop size
+    void     emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
 
     void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets
 
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 46204082801ccb..7ccbf5b5c21a5b 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2669,7 +2669,6 @@ void emitter::emitLoopAlign()
     emitCurIGsize += 15;
 }
 
-#ifndef ADAPTIVE_LOOP_ALIGNMENT
 /*****************************************************************************
  *
  *  The next instruction will be a loop head entry point
@@ -2677,10 +2676,11 @@ void emitter::emitLoopAlign()
  *  the x86 I-cache alignment rule is followed.
  */
 
-void emitter::emitVariableLoopAlign()
+void emitter::emitVariableLoopAlign(unsigned alignmentBoundary)
 {
-    unsigned insAlignCount    = (emitComp->opts.compJitAlignLoopBoundary - 1) / 15;
-    unsigned lastInsAlignSize = (emitComp->opts.compJitAlignLoopBoundary - 1) % 15;
+    unsigned nPaddingBytes    = alignmentBoundary - 1;
+    unsigned insAlignCount    = nPaddingBytes / 15;
+    unsigned lastInsAlignSize = nPaddingBytes % 15;
 
     while (insAlignCount)
     {
@@ -2699,7 +2699,6 @@ void emitter::emitVariableLoopAlign()
         emitCurIGsize += lastInsAlignSize;
     }
 }
-#endif
 
 /*****************************************************************************
  *
@@ -12632,256 +12631,480 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             {
                 // Candidate for loop alignment
                 assert(codeGen->ShouldAlignLoops());
-                
-#ifdef ADAPTIVE_LOOP_ALIGNMENT
-                unsigned alignmentBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
-#else
-                unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
-#endif
                 sz = SMALL_IDSC_SIZE;
+                unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
+                int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
+                unsigned maxLoopSize             = emitComp->opts.compJitAlignLoopAdaptive
+                                           ? alignmentBoundary * maxBlocksAllowedForLoop
+                                           : emitComp->opts.compJitAlignLoopMaxCodeSize;
+                bool     skipPadding       = false;
 
 #if DEBUG
                 bool displayAlignmentDetails =
                     (emitComp->opts.disAsm /*&& emitComp->opts.disAddr*/) || emitComp->verbose;
 #endif
+               // Check if this IG was already determined not to need alignment
                if ((ig->igFlags & IGF_ALIGN_LOOP) == 0)
                 {
-                    id->idCodeSize(0);
-                    ig->igFlags |= IGF_UPD_ISZ;
+                    /*id->idCodeSize(0);
+                    ig->igFlags |= IGF_UPD_ISZ;*/
+                    skipPadding = true;
+#if DEBUG
                     if (displayAlignmentDetails)
                     {
                         printf("\t\t;; Skip alignment: 'Big loop.' in (%s)\n", emitComp->info.compFullName);
                     }
-                    break;
+#endif
+                    //break;
                 }
 
-                // If already at alignment boundary, no need to emit anything.
+                // Check if the loop is already at alignment boundary
                 if (((size_t)dst & (alignmentBoundary - 1)) == 0)
                 {
-                    id->idCodeSize(0);
-                    ig->igFlags |= IGF_UPD_ISZ;
+                    /*id->idCodeSize(0);
+                    ig->igFlags |= IGF_UPD_ISZ;*/
+                    skipPadding = true;
+#if DEBUG
                     if (displayAlignmentDetails)
                     {
-                        printf("\t\t;; Skip alignment: 'Loop already aligned at boundary.' in (%s)\n",
-                               emitComp->info.compMethodName);
+                        printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
+                               alignmentBoundary, emitComp->info.compMethodName);
                     }
-                    break;
+#endif
+                    //break;
                 }
 
-#ifndef ADAPTIVE_LOOP_ALIGNMENT
-                if (emitComp->opts.compJitAlignLoopAdaptive)
-#endif
+                unsigned paddingAdded = 0;
+                if (!skipPadding)
                 {
-                    bool     skipPadding             = false;
-                    int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
-                    unsigned maxLoopSize             = DEFAULT_ALIGN_LOOP_BOUNDARY * maxBlocksAllowedForLoop;
-
-                    // calculate the loop size
-                    unsigned  loopSize     = 0;
-                    insGroup* loopHeaderIg = ig->igNext;
-                    for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
+                    // Adaptive padding
+                    if (emitComp->opts.compJitAlignLoopAdaptive)
                     {
-                        loopSize += igInLoop->igSize;
-                        if (igInLoop->igLoopBackEdge == loopHeaderIg || loopSize > maxLoopSize)
+                        // Start to align on 32B boundary with a fallback to 16B boundary
+                        unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize);
+                        int      minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+                        unsigned nMaxPaddingBytes = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
+                        unsigned nPaddingBytes    = (-(int)(size_t)dst) & (alignmentBoundary - 1);
+
+                        // Check if the loop exceeds maxLoopSize
+                        if (loopSize > maxLoopSize)
                         {
-                            break;
+                            skipPadding = true;
+                            assert(!"Should never hit maxLoopSize threshold because it should have been predicted "
+                                    "earlier.");
                         }
-                    }
-
-                    // Start to align on 32B boundary with a fallback to 16B boundary
-                    int      minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-                    unsigned nMaxPaddingBytes       = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
-                    unsigned nPaddingBytes          = (-(int)(size_t)dst) & (alignmentBoundary - 1);
 
-                    if (loopSize > maxLoopSize)
-                    {
-                        skipPadding = true;
-                        assert(!"This should have checked before!");
-#if DEBUG
-                        /*if (displayAlignmentDetails)
+                        // Check if the alignment exceeds maxPadding limit
+                        else if (nPaddingBytes > nMaxPaddingBytes)
                         {
-                            printf("\t\t;; Skip alignment: 'Loopsize= %d, MaxLoopSize= %d, Estimated= %d.' in (%s)\n", loopSize,
-                                   maxLoopSize, ig->loopSize, emitComp->info.compFullName);
-                        }*/
-#endif
-                    }
-                    else if (nPaddingBytes > nMaxPaddingBytes)
-                    {
-                        // Cannot add large padding to align to 32B, so try to align to 16B boundary.
-                        alignmentBoundary = 16;
-                        nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
-                        nPaddingBytes     = (-(int)(size_t)dst) & (alignmentBoundary - 1);
+                            // Cannot align to 32B, so try to align to 16B boundary.
+                            alignmentBoundary >>= 1;
+                            nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
+                            nPaddingBytes     = (-(int)(size_t)dst) & (alignmentBoundary - 1);
 
-                        if (nPaddingBytes > nMaxPaddingBytes)
-                        {
-                            skipPadding = true;
-#if DEBUG
-                            if (displayAlignmentDetails)
+                            // Check if the loop is already at new alignment boundary
+                            if (nPaddingBytes == 0)
                             {
-                                printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
-                                       "AlignmentBoundary= %dB.' in (%s)\n",
-                                       nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary,
-                                       emitComp->info.compFullName);
+                                skipPadding = true;
+#if DEBUG
+                                if (displayAlignmentDetails)
+                                {
+                                    printf("\t\t;; Skip alignment: 'Loop already aligned at 16B boundary.' in (%s)\n",
+                                           emitComp->info.compMethodName);
+                                }
+#endif
                             }
+                            // Check if the alignment exceeds new maxPadding limit
+                            else if (nPaddingBytes > nMaxPaddingBytes)
+                            {
+                                skipPadding = true;
+#if DEBUG
+                                if (displayAlignmentDetails)
+                                {
+                                    printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
+                                           "AlignmentBoundary= %dB.' in (%s)\n",
+                                           nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary,
+                                           emitComp->info.compFullName);
+                                }
 #endif
+                            }
                         }
-                    }
 
-                    if (!skipPadding && (nPaddingBytes > 0))
-                    {
-                        size_t extraBytesNotInLoop =
-                            (32 * minBlocksNeededForLoop) - loopSize; // Still have it at alignmentboundary=32
-                        size_t currentOffset = (size_t)dst % alignmentBoundary;
-
-                        // Padding is needed only if loop starts at or after the current offset.
-                        // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
-                        if (currentOffset <= extraBytesNotInLoop)
+                        if (!skipPadding)
                         {
-                            skipPadding = true;
-#if DEBUG
-                            if (displayAlignmentDetails)
+                            // Padding is needed only if the current offset exceeds the bytes left unused by the loop
+                            // in its 32B chunks. Otherwise, the loop still fits in minBlocksNeededForLoop chunks, so skip alignment.
+                            size_t extraBytesNotInLoop = (32 * minBlocksNeededForLoop) - loopSize; // For calculation, use 32B chunks
+                            size_t currentOffset = (size_t)dst % alignmentBoundary;
+
+                            // Check if loop starts from offset such that padding can be skipped.
+                            if (currentOffset <= extraBytesNotInLoop)
                             {
-                                printf("\t\t;; Skip alignment: 'Loop already aligned.' in (%s)\n",
-                                       emitComp->info.compMethodName);
-                            }
-#endif
-                        }
-                        else
-                        {
-                            dst = emitOutputNOP(dst, nPaddingBytes);
-                            assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
+                                skipPadding = true;
 #if DEBUG
-                            if (displayAlignmentDetails)
-                            {
-                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
-                                       "AlignmentBoundary= %dB.' in (%s)\n",
-                                       nPaddingBytes, loopSize, minBlocksNeededForLoop, alignmentBoundary,
-                                       emitComp->info.compFullName);
+                                if (displayAlignmentDetails)
+                                {
+                                    printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
+                                          alignmentBoundary, emitComp->info.compMethodName);
+                                }
+#endif
                             }
+                            else
+                            {
+                                // Perform the padding
+                                paddingAdded = nPaddingBytes;
+                                dst = emitOutputNOP(dst, nPaddingBytes);
+                                assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
+#if DEBUG
+                                if (displayAlignmentDetails)
+                                {
+                                    printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
+                                           "AlignmentBoundary= %dB.' in (%s)\n",
+                                           nPaddingBytes, loopSize, minBlocksNeededForLoop, alignmentBoundary,
+                                           emitComp->info.compFullName);
+                                }
 #endif
+                            }
                         }
-                    }
 
-                    // Update the code size of id
-                    if (skipPadding)
-                    {
-                        id->idCodeSize(0);
-                        ig->igFlags |= IGF_UPD_ISZ;
-                    }
-                    else if (nPaddingBytes != id->idCodeSize())
-                    {
-                        id->idCodeSize(nPaddingBytes);
-                        ig->igFlags |= IGF_UPD_ISZ;
-                    }
-                }
-#ifndef ADAPTIVE_LOOP_ALIGNMENT
-                else
-                {
-                    instrDesc* nextId = id;
-                    castto(nextId, BYTE*) += sz;
-
-                    // If we already know that the code size heuristics won't match,
-                    // do not bother checking it again. Same applies for next instruction
-                    // if that too is INS_align.
-                    if ((id->idCodeSize() == 0))
-                    {
-                        if (nextId->idIns() == INS_align)
-                        {
-                            assert(alignmentBoundary > 16);
-                            nextId->idCodeSize(0);
-                        }
-                        break;
+                        //// Update the code size of id
+                        //if (skipPadding)
+                        //{
+                        //    id->idCodeSize(0);
+                        //    ig->igFlags |= IGF_UPD_ISZ;
+                        //}
+                        //else if (nPaddingBytes != id->idCodeSize())
+                        //{
+                        //    id->idCodeSize(nPaddingBytes);
+                        //    ig->igFlags |= IGF_UPD_ISZ;
+                        //}
                     }
-
-                    unsigned  loopSize     = 0;
-                    insGroup* loopHeaderIg = ig->igNext;
-                    for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
+                    // Non-adaptive padding
+                    else
                     {
-                        loopSize += igInLoop->igSize;
-                        if (igInLoop->igLoopBackEdge == loopHeaderIg ||
-                            loopSize > emitComp->opts.compJitAlignLoopMaxCodeSize)
+                        instrDesc* nextId = id;
+                        castto(nextId, BYTE*) += sz;
+
+                        // For padding > 15 bytes, check if we already performed/skipped
+                        // padding during previous INS_align instruction.
+                        // If yes, skip for current instruction as well as next, if that
+                        // too is INS_align.
+                        if ((id->idCodeSize() == 0))
                         {
+                            if (nextId->idIns() == INS_align)
+                            {
+                                assert(alignmentBoundary > 16);
+                                nextId->idCodeSize(0);
+                            }
                             break;
                         }
-                    }
 
-                    // Only align if it matches the heuristics
-                    if (loopSize <= emitComp->opts.compJitAlignLoopMaxCodeSize)
-                    {
-                        int    minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-                        size_t extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
-                        size_t currentOffset       = (size_t)dst % alignmentBoundary;
+                        unsigned loopSize            = getLoopSize(ig->igNext, maxLoopSize);
+                        unsigned minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+                        unsigned extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
+                        unsigned currentOffset       = (size_t)dst % alignmentBoundary;
 
+#ifdef DEBUG
                         // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
                         if (emitComp->opts.compJitAlignLoopForJcc)
                         {
                             // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
                             currentOffset++;
                         }
-
-                        // TODO: Revisit nop sequence we emit in case of 31 bytes
+#endif
+                        // Check if the loop exceeds maxLoopSize
+                        if (loopSize > maxLoopSize)
+                        {
+                            skipPadding = true;
+                            assert(!"Should never hit maxLoopSize threshold because it should have been predicted "
+                                    "earlier.");
+                        }
 
                         // Padding is needed only if loop starts at or after the current offset.
-                        // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
-                        if (currentOffset <= extraBytesNotInLoop)
+                        // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
+                        else if (currentOffset <= extraBytesNotInLoop)
                         {
+                            skipPadding = true;
+#if DEBUG
                             if (displayAlignmentDetails)
                             {
-                                printf("\t\t;; Skip alignment: 'Loop already aligned.' in (%s)\n",
-                                       emitComp->info.compMethodName);
-                            }
-                            if (nextId->idIns() == INS_align)
-                            {
-                                assert(alignmentBoundary > 16);
-                                nextId->idCodeSize(0);
+                                printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
+                                       alignmentBoundary, emitComp->info.compMethodName);
                             }
+#endif
                         }
                         else
                         {
-                            size_t nBytes = (-(int)(size_t)dst) & 0x0f;
-                            dst           = emitOutputNOP(dst, nBytes);
+                            // Perform the padding
 
-                            if (nextId->idIns() == INS_align)
-                            {
-                                // If next instruction is also alignment, this better be 32B padding.
-                                assert(alignmentBoundary > 16);
+                            unsigned nPaddingBytes = (-(int)(size_t)dst) & (alignmentBoundary - 1);
+                            unsigned padCounts     = nPaddingBytes / 15;
+                            unsigned lastPadding   = nPaddingBytes % 15;
 
-                                // Align further to 32B boundary, if it is not yet.
-                                if (((size_t)dst & 0x1f) != 0)
-                                {
-                                    dst = emitOutputNOP(dst, 15);
-                                    dst = emitOutputNOP(dst, 1);
-                                    nBytes += 16;
-                                }
+                            while (padCounts)
+                            {
+                                dst = emitOutputNOP(dst, 15);
+                                padCounts--;
                             }
+
+                            dst = emitOutputNOP(dst, lastPadding);
+                            paddingAdded = nPaddingBytes;
+
+                            ///* Insert a pseudo-instruction to ensure that we align
+                            //   the next instruction properly */
+
+                            //if (lastInsAlignSize > 0)
+                            //{
+                            //    instrDesc* id = emitNewInstrSmall(EA_1BYTE);
+                            //    id->idIns(INS_align);
+                            //    id->idCodeSize(lastInsAlignSize);
+                            //    emitCurIGsize += lastInsAlignSize;
+                            //}
+
+                            //// TODO: Revisit nop sequence we emit in case of 31 bytes
+                            //size_t nBytes = (-(int)(size_t)dst) & 0x0f;
+                            //dst           = emitOutputNOP(dst, nBytes);
+
+                            //if (nextId->idIns() == INS_align)
+                            //{
+                            //    // If next instruction is also alignment, this better be 32B padding.
+                            //    assert(alignmentBoundary > 16);
+
+                            //    // Align further to 32B boundary, if it is not yet.
+                            //    if (((size_t)dst & 0x1f) != 0)
+                            //    {
+                            //        dst = emitOutputNOP(dst, 15);
+                            //        dst = emitOutputNOP(dst, 1);
+                            //        nBytes += 16;
+                            //    }
+                            //}
+                            //paddingAdded = nBytes;
+
+#if DEBUG
                             if (displayAlignmentDetails)
                             {
-                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
-                                       "AlignmentBoundary= %dB.' in (%s)\n",
-                                       nBytes, loopSize, minimumBlocksNeeded, alignmentBoundary,
-                                       emitComp->info.compFullName);
+                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, AlignmentBoundary= %dB.' in (%s)\n",
+                                       nPaddingBytes, loopSize, alignmentBoundary, emitComp->info.compFullName);
                             }
+#endif
                             // In the end dst should be at alignment boundary
                             assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
                         }
-                    }
-                    else
-                    {
-                        if (displayAlignmentDetails)
-                        {
-                            printf("\t\t;; Skip alignment: 'Loopsize= %d, AllowedMaxSize= %d.' in (%s)\n", loopSize,
-                                   emitComp->opts.compJitAlignLoopMaxCodeSize, emitComp->info.compFullName);
-                        }
-                        // If next instruction is align, skip it so
-                        // we do not check the heuristics again.
-                        if (nextId->idIns() == INS_align)
+
+                        // For padding > 15 bytes, multiple INS_align(15) are emitted.
+                        // If decided to skipPadding, just mark it so for future INS_align
+                        // instructions as well.
+                        if (!skipPadding)
                         {
-                            assert(alignmentBoundary > 16);
-                            nextId->idCodeSize(0);
+                            if (nextId->idIns() == INS_align)
+                            {
+                                assert(alignmentBoundary > 16);
+                                nextId->idCodeSize(0);
+                            }
                         }
                     }
                 }
-#endif
+
+                // If we didn't add as much padding as we thought, update the code size and flag.
+                if (paddingAdded != id->idCodeSize())
+                {
+                    assert(paddingAdded != 0 || skipPadding);
+                    id->idCodeSize(paddingAdded);
+                    ig->igFlags |= IGF_UPD_ISZ;
+                }
+
+//
+//                ///--------------------------------------------------------------
+//                if (!skipPadding && emitComp->opts.compJitAlignLoopAdaptive)
+//                {
+//                    unsigned loopSize                = getLoopSize(ig->igNext, maxLoopSize);
+//
+//                    // Start to align on 32B boundary with a fallback to 16B boundary
+//                    int      minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+//                    unsigned nMaxPaddingBytes       = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
+//                    unsigned nPaddingBytes          = (-(int)(size_t)dst) & (alignmentBoundary - 1);
+//
+//                    if (nPaddingBytes > nMaxPaddingBytes)
+//                    {
+//                        // Cannot add large padding to align to 32B, so try to align to 16B boundary.
+//                        alignmentBoundary = 16;
+//                        nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
+//                        nPaddingBytes     = (-(int)(size_t)dst) & (alignmentBoundary - 1);
+//
+//                        if (nPaddingBytes > nMaxPaddingBytes)
+//                        {
+//                            skipPadding = true;
+//#if DEBUG
+//                            if (displayAlignmentDetails)
+//                            {
+//                                printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
+//                                       "AlignmentBoundary= %dB.' in (%s)\n",
+//                                       nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary,
+//                                       emitComp->info.compFullName);
+//                            }
+//#endif
+//                        }
+//                    }
+//
+//                    if (!skipPadding && (nPaddingBytes > 0))
+//                    {
+//                        size_t extraBytesNotInLoop =
+//                            (32 * minBlocksNeededForLoop) - loopSize; // Still have it at alignmentboundary=32
+//                        size_t currentOffset = (size_t)dst % alignmentBoundary;
+//
+//                        // Padding is needed only if loop starts at or after the current offset.
+//                        // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
+//                        if (currentOffset <= extraBytesNotInLoop)
+//                        {
+//                            skipPadding = true;
+//#if DEBUG
+//                            if (displayAlignmentDetails)
+//                            {
+//                                printf("\t\t;; Skip alignment: 'Loop already aligned.' in (%s)\n",
+//                                       emitComp->info.compMethodName);
+//                            }
+//#endif
+//                        }
+//                        else
+//                        {
+//                            dst = emitOutputNOP(dst, nPaddingBytes);
+//                            assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
+//#if DEBUG
+//                            if (displayAlignmentDetails)
+//                            {
+//                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
+//                                       "AlignmentBoundary= %dB.' in (%s)\n",
+//                                       nPaddingBytes, loopSize, minBlocksNeededForLoop, alignmentBoundary,
+//                                       emitComp->info.compFullName);
+//                            }
+//#endif
+//                        }
+//                    }
+//
+//                    // Update the code size of id
+//                    if (skipPadding)
+//                    {
+//                        id->idCodeSize(0);
+//                        ig->igFlags |= IGF_UPD_ISZ;
+//                    }
+//                    else if (nPaddingBytes != id->idCodeSize())
+//                    {
+//                        id->idCodeSize(nPaddingBytes);
+//                        ig->igFlags |= IGF_UPD_ISZ;
+//                    }
+//                }
+//                else
+//                {
+//                    instrDesc* nextId      = id;
+//                    castto(nextId, BYTE*) += sz;
+//
+//                    // If we already know that the code size heuristics won't match,
+//                    // do not bother checking it again. Same applies for next instruction
+//                    // if that too is INS_align.
+//                    if ((id->idCodeSize() == 0))
+//                    {
+//                        if (nextId->idIns() == INS_align)
+//                        {
+//                            assert(alignmentBoundary > 16);
+//                            nextId->idCodeSize(0);
+//                        }
+//                        break;
+//                    }
+//
+//                    unsigned  loopSize     = 0;
+//                    insGroup* loopHeaderIg = ig->igNext;
+//                    for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
+//                    {
+//                        loopSize += igInLoop->igSize;
+//                        if (igInLoop->igLoopBackEdge == loopHeaderIg || loopSize > maxLoopSize)
+//                        {
+//                            break;
+//                        }
+//                    }
+//
+//                    if (loopSize > maxLoopSize)
+//                    {
+//                        skipPadding = true;
+//                        assert(
+//                            !"Should never hit maxLoopSize threshold because it should have been predicted earlier.");
+//                    }
+//                    else
+//                    {
+//                        unsigned minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+//                        unsigned extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
+//                        unsigned currentOffset       = (size_t)dst % alignmentBoundary;
+//
+//                        // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
+//                        if (emitComp->opts.compJitAlignLoopForJcc)
+//                        {
+//                            // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
+//                            currentOffset++;
+//                        }
+//
+//                        // TODO: Revisit nop sequence we emit in case of 31 bytes
+//
+//                        // Padding is needed only if loop starts at or after the current offset.
+//                        // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
+//                        if (currentOffset <= extraBytesNotInLoop)
+//                        {
+//                            if (displayAlignmentDetails)
+//                            {
+//                                printf("\t\t;; Skip alignment: 'Loop already aligned.' in (%s)\n",
+//                                       emitComp->info.compMethodName);
+//                            }
+//                            if (nextId->idIns() == INS_align)
+//                            {
+//                                assert(alignmentBoundary > 16);
+//                                nextId->idCodeSize(0);
+//                            }
+//                        }
+//                        else
+//                        {
+//                            size_t nBytes = (-(int)(size_t)dst) & 0x0f;
+//                            dst           = emitOutputNOP(dst, nBytes);
+//
+//                            if (nextId->idIns() == INS_align)
+//                            {
+//                                // If next instruction is also alignment, this better be 32B padding.
+//                                assert(alignmentBoundary > 16);
+//
+//                                // Align further to 32B boundary, if it is not yet.
+//                                if (((size_t)dst & 0x1f) != 0)
+//                                {
+//                                    dst = emitOutputNOP(dst, 15);
+//                                    dst = emitOutputNOP(dst, 1);
+//                                    nBytes += 16;
+//                                }
+//                            }
+//                            if (displayAlignmentDetails)
+//                            {
+//                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
+//                                       "AlignmentBoundary= %dB.' in (%s)\n",
+//                                       nBytes, loopSize, minimumBlocksNeeded, alignmentBoundary,
+//                                       emitComp->info.compFullName);
+//                            }
+//                            // In the end dst should be at alignment boundary
+//                            assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
+//                        }
+//                    }
+//
+//
+//                    else
+//                    {
+//                        if (displayAlignmentDetails)
+//                        {
+//                            printf("\t\t;; Skip alignment: 'Loopsize= %d, AllowedMaxSize= %d.' in (%s)\n", loopSize,
+//                                   emitComp->opts.compJitAlignLoopMaxCodeSize, emitComp->info.compFullName);
+//                        }
+//                        // If next instruction is align, skip it so
+//                        // we do not check the heuristics again.
+//                        if (nextId->idIns() == INS_align)
+//                        {
+//                            assert(alignmentBoundary > 16);
+//                            nextId->idCodeSize(0);
+//                        }
+//                    }
+//                }
 
                 break;
             }
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 566d48581be3e3..e3e8940c49c20e 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -289,10 +289,7 @@ inline emitAttr emitDecodeScale(unsigned ensz)
 public:
 void emitLoopAlign();
 
-#ifndef ADAPTIVE_LOOP_ALIGNMENT
-void emitVariableLoopAlign();
-
-#endif
+void emitVariableLoopAlign(unsigned alignmentBoundary);
 
 void emitIns(instruction ins);
 
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 3d65ca1a7d32a3..997dc0e2fcfff2 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -47,7 +47,8 @@ CONFIG_INTEGER(JitAlignLoopMinBlockWeight,
                                                     // candidate for alignment.
 CONFIG_INTEGER(JitAlignLoopMaxCodeSize,
                W("JitAlignLoopMaxCodeSize"),
-               0x60) // For non-adaptive alignment, minimum loop size (in bytes) for which alignment will be done.
+               DEFAULT_MAX_LOOPSIZE_FOR_ALIGN) // For non-adaptive alignment, maximum loop size (in bytes) for which
+                                                // alignment will be done.
                      // Defaults to 3 blocks of 32 bytes chunks = 96 bytes.
 CONFIG_INTEGER(JitAlignLoopBoundary,
                W("JitAlignLoopBoundary"),
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 703ca580225117..ff4b1243289d7c 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2546,15 +2546,9 @@ void Compiler::optFindNaturalLoops()
 #if defined(TARGET_XARCH)
         if (codeGen->ShouldAlignLoops())
         {
-#ifdef ADAPTIVE_LOOP_ALIGNMENT
-            unsigned minBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
-#else
-            unsigned minBlockWeight = opts.compJitAlignLoopMinBlockWeight;
-#endif
-
             // An innerloop candidate that might need alignment
             if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) &&
-                minBlockWeight <= first->getBBWeight(this))
+                opts.compJitAlignLoopMinBlockWeight <= first->getBBWeight(this))
             {
                 first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
             }

From 5584d71bd0989b67abddfd1a5b01d6a1c2dd0120 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 11 Nov 2020 16:32:58 -0800
Subject: [PATCH 28/59] Code cleanup

---
 src/coreclr/jit/codegencommon.cpp |   1 +
 src/coreclr/jit/codegenlinear.cpp |   1 -
 src/coreclr/jit/compiler.cpp      |  14 +-
 src/coreclr/jit/emit.cpp          |  22 +-
 src/coreclr/jit/emit.h            |   6 +-
 src/coreclr/jit/emitxarch.cpp     | 334 ++++--------------------------
 src/coreclr/jit/optimizer.cpp     |   7 -
 7 files changed, 64 insertions(+), 321 deletions(-)

diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index 2dcbced91d7518..c7ecda8fb17721 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -2259,6 +2259,7 @@ void CodeGen::genGenerateMachineCode()
     GetEmitter()->emitJumpDistBind();
 
     /* Perform alignment adjustments */
+
     GetEmitter()->emitLoopAlignAdjustments();
 
     /* The code is now complete and final; it should not change after this. */
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index f721d0e7d513eb..ec7a564b06a46d 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -776,7 +776,6 @@ void CodeGen::genCodeForBBlist()
 
             if ((compiler->opts.compJitAlignLoopBoundary > 16) && (!compiler->opts.compJitAlignLoopAdaptive))
             {
-                // TODO: Only do this if we are confident that the loop size doesn't exceed the heuristics threshold
                 GetEmitter()->emitVariableLoopAlign(compiler->opts.compJitAlignLoopBoundary);
             }
             else
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 53046d392791fa..0717c551d39592 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2626,10 +2626,15 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     assert(isPow2(opts.compJitAlignLoopBoundary));
 #else
     opts.compJitAlignLoopAdaptive = true;
-    opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
     opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
 #endif
 
+    // Adaptive alignment works on 32B boundary
+    if (opts.compJitAlignLoopAdaptive)
+    {
+        opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
+    }
+
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns
     opts.compDbgCode = false;
@@ -3932,11 +3937,10 @@ void Compiler::compSetOptimizationLevel()
         {
             codeGen->SetAlignLoops(false); // loop alignment not supported for prejitted code
 
-            // The zapper doesn't set JitFlags::JIT_FLAG_ALIGN_LOOPS, and there is
-            // no reason for it to set it as the JIT doesn't currently support loop alignment
-            // for prejitted images. (The JIT doesn't know the final address of the code, hence
+            // The JIT doesn't currently support loop alignment for prejitted images.
+            // (The JIT doesn't know the final address of the code, hence
             // it can't align code based on unknown addresses.)
-            assert(!opts.jitFlags->IsSet(JitFlags::JIT_FLAG_ALIGN_LOOPS));
+            assert(JitConfig.JitAlignLoops() == 0);
         }
         else
         {
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 3951453a9c818f..ca7c6566f0eae3 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3649,14 +3649,11 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
 
     UNATIVE_OFFSET csz = (UNATIVE_OFFSET)(*dp - curInsAdr);
 
-    if (
-#if defined(TARGET_XARCH)
-        (id->idIns() != INS_align) &&
-#endif
-        (csz != id->idCodeSize()))
+    if (csz != id->idCodeSize())
     {
-        /* It is fatal to under-estimate the instruction size */
-        noway_assert(id->idCodeSize() >= csz);
+        /* It is fatal to under-estimate the instruction size, except when it is an alignment instruction */
+        noway_assert(id->idCodeSize() >= csz || (!emitComp->opts.compJitAlignLoopAdaptive && id->idIns() == INS_align &&
+                                                 emitComp->opts.compJitAlignLoopBoundary > 16));
 
 #if DEBUG_EMIT
         if (EMITVERBOSE)
@@ -3665,7 +3662,6 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
         }
 #endif // DEBUG_EMIT
 
-        //assert(id->idCodeSize() == csz);
 
         /* The instruction size estimate wasn't accurate; remember this */
 
@@ -3684,11 +3680,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
 
 #ifdef DEBUG
     /* Make sure the instruction descriptor size also matches our expectations */
-    if (
-#if defined(TARGET_XARCH)
-        (id->idIns() != INS_align) &&
-#endif
-        (is != emitSizeOfInsDsc(id)))
+    if (is != emitSizeOfInsDsc(id))
     {
         printf("%s at %u: Expected size = %u , actual size = %u\n", emitIfName(id->idInsFmt()),
                id->idDebugOnlyInfo()->idNum, is, emitSizeOfInsDsc(id));
@@ -4882,9 +4874,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
     // For x64/x86, align methods that are "optimizations enabled" to 32 byte boundaries if
     // they are larger than 16 bytes and contain a loop.
     //
-    if (
-        // emitComp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_TIER1) &&
-        emitComp->opts.OptimizationEnabled() && !emitComp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT) &&
+    if (emitComp->opts.OptimizationEnabled() && !emitComp->opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT) &&
         (emitTotalHotCodeSize > 16) && emitComp->fgHasLoops)
     {
         allocMemFlag = CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN;
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 8ea65626dd119d..afe465ad546522 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -250,7 +250,7 @@ struct insGroup
     unsigned int   igFuncIdx; // Which function/funclet does this belong to? (Index into Compiler::compFuncInfos array.)
     unsigned short igFlags;   // see IGF_xxx below
     unsigned short igSize;    // # of bytes of code in this group
-    insGroup*      igLoopBackEdge;
+    insGroup*      igLoopBackEdge; // Back-edge that points to the loop head.
 
 #define IGF_GC_VARS 0x0001    // new set of live GC ref variables
 #define IGF_BYREF_REGS 0x0002 // new set of live by-ref registers
@@ -265,7 +265,8 @@ struct insGroup
 #define IGF_PLACEHOLDER 0x0100    // this is a placeholder group, to be filled in later
 #define IGF_EXTEND 0x0200         // this block is conceptually an extension of the previous block
                                   // and the emitter should continue to track GC info as if there was no new block.
-#define IGF_ALIGN_LOOP 0x0400
+#define IGF_ALIGN_LOOP 0x0400     // this group contains an alignment instruction at the end because the next IG points
+                                  // to an inner loop that needs alignment.
 
 // Mask of IGF_* flags that should be propagated to new blocks when they are created.
 // This allows prologs and epilogs to be any number of IGs, but still be
@@ -309,7 +310,6 @@ struct insGroup
 #endif
 
     unsigned char igInsCnt; // # of instructions  in this group
-                            // TODO: Add loopBackEdge?
 
 #endif // REGMASK_BITS
 
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 7ccbf5b5c21a5b..35dda98594879c 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -12631,23 +12631,18 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             {
                 // Candidate for loop alignment
                 assert(codeGen->ShouldAlignLoops());
-                sz = SMALL_IDSC_SIZE;
+
                 unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
-                int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
-                unsigned maxLoopSize             = emitComp->opts.compJitAlignLoopAdaptive
-                                           ? alignmentBoundary * maxBlocksAllowedForLoop
-                                           : emitComp->opts.compJitAlignLoopMaxCodeSize;
                 bool     skipPadding       = false;
+                sz                         = SMALL_IDSC_SIZE;
 
 #if DEBUG
                 bool displayAlignmentDetails =
                     (emitComp->opts.disAsm /*&& emitComp->opts.disAddr*/) || emitComp->verbose;
 #endif
-               // Check if this IG is already detected to not have alignment
+               // Check if we already detected that this IG does not need alignment
                if ((ig->igFlags & IGF_ALIGN_LOOP) == 0)
                 {
-                    /*id->idCodeSize(0);
-                    ig->igFlags |= IGF_UPD_ISZ;*/
                     skipPadding = true;
 #if DEBUG
                     if (displayAlignmentDetails)
@@ -12655,14 +12650,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                         printf("\t\t;; Skip alignment: 'Big loop.' in (%s)\n", emitComp->info.compFullName);
                     }
 #endif
-                    //break;
                 }
 
                 // Check if the loop is already at alignment boundary
                 if (((size_t)dst & (alignmentBoundary - 1)) == 0)
                 {
-                    /*id->idCodeSize(0);
-                    ig->igFlags |= IGF_UPD_ISZ;*/
                     skipPadding = true;
 #if DEBUG
                     if (displayAlignmentDetails)
@@ -12671,18 +12663,19 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                                alignmentBoundary, emitComp->info.compMethodName);
                     }
 #endif
-                    //break;
                 }
 
-                unsigned paddingAdded = 0;
+                unsigned paddingToAdd = 0;
                 if (!skipPadding)
                 {
                     // Adaptive padding
                     if (emitComp->opts.compJitAlignLoopAdaptive)
                     {
                         // Start to align on 32B boundary with a fallback to 16B boundary
-                        unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize);
-                        int      minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+                        int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
+                        unsigned maxLoopSize             = alignmentBoundary * maxBlocksAllowedForLoop;
+                        unsigned loopSize                = getLoopSize(ig->igNext, maxLoopSize);
+                        unsigned minBlocksNeededForLoop  = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
                         unsigned nMaxPaddingBytes = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
                         unsigned nPaddingBytes    = (-(int)(size_t)dst) & (alignmentBoundary - 1);
 
@@ -12734,7 +12727,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                         {
                             // Padding is needed only if loop starts at or after the current offset.
                             // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
-                            size_t extraBytesNotInLoop = (32 * minBlocksNeededForLoop) - loopSize; // For calculation, use 32B chunks
+                            size_t extraBytesNotInLoop = (size_t)(32 * minBlocksNeededForLoop) - loopSize; // For calculation, use 32B chunks
                             size_t currentOffset = (size_t)dst % alignmentBoundary;
 
                             // Check if loop starts from offset such that padding can be skipped.
@@ -12752,32 +12745,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                             else
                             {
                                 // Perform the padding
-                                paddingAdded = nPaddingBytes;
-                                dst = emitOutputNOP(dst, nPaddingBytes);
-                                assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
-#if DEBUG
-                                if (displayAlignmentDetails)
-                                {
-                                    printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
-                                           "AlignmentBoundary= %dB.' in (%s)\n",
-                                           nPaddingBytes, loopSize, minBlocksNeededForLoop, alignmentBoundary,
-                                           emitComp->info.compFullName);
-                                }
-#endif
+                                paddingToAdd = nPaddingBytes;
                             }
                         }
-
-                        //// Update the code size of id
-                        //if (skipPadding)
-                        //{
-                        //    id->idCodeSize(0);
-                        //    ig->igFlags |= IGF_UPD_ISZ;
-                        //}
-                        //else if (nPaddingBytes != id->idCodeSize())
-                        //{
-                        //    id->idCodeSize(nPaddingBytes);
-                        //    ig->igFlags |= IGF_UPD_ISZ;
-                        //}
                     }
                     // Non-adaptive padding
                     else
@@ -12799,6 +12769,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                             break;
                         }
 
+                        unsigned maxLoopSize         = emitComp->opts.compJitAlignLoopMaxCodeSize;
                         unsigned loopSize            = getLoopSize(ig->igNext, maxLoopSize);
                         unsigned minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
                         unsigned extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
@@ -12836,59 +12807,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                         else
                         {
                             // Perform the padding
-
-                            unsigned nPaddingBytes = (-(int)(size_t)dst) & (alignmentBoundary - 1);
-                            unsigned padCounts     = nPaddingBytes / 15;
-                            unsigned lastPadding   = nPaddingBytes % 15;
-
-                            while (padCounts)
-                            {
-                                dst = emitOutputNOP(dst, 15);
-                                padCounts--;
-                            }
-
-                            dst = emitOutputNOP(dst, lastPadding);
-                            paddingAdded = nPaddingBytes;
-
-                            ///* Insert a pseudo-instruction to ensure that we align
-                            //   the next instruction properly */
-
-                            //if (lastInsAlignSize > 0)
-                            //{
-                            //    instrDesc* id = emitNewInstrSmall(EA_1BYTE);
-                            //    id->idIns(INS_align);
-                            //    id->idCodeSize(lastInsAlignSize);
-                            //    emitCurIGsize += lastInsAlignSize;
-                            //}
-
-                            //// TODO: Revisit nop sequence we emit in case of 31 bytes
-                            //size_t nBytes = (-(int)(size_t)dst) & 0x0f;
-                            //dst           = emitOutputNOP(dst, nBytes);z
-
-                            //if (nextId->idIns() == INS_align)
-                            //{
-                            //    // If next instruction is also alignment, this better be 32B padding.
-                            //    assert(alignmentBoundary > 16);
-
-                            //    // Align further to 32B boundary, if it is not yet.
-                            //    if (((size_t)dst & 0x1f) != 0)
-                            //    {
-                            //        dst = emitOutputNOP(dst, 15);
-                            //        dst = emitOutputNOP(dst, 1);
-                            //        nBytes += 16;
-                            //    }
-                            //}
-                            //paddingAdded = nBytes;
-
-#if DEBUG
-                            if (displayAlignmentDetails)
-                            {
-                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, AlignmentBoundary= %dB.' in (%s)\n",
-                                       nPaddingBytes, loopSize, alignmentBoundary, emitComp->info.compFullName);
-                            }
-#endif
-                            // In the end dst should be at alignment boundary
-                            assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
+                            paddingToAdd = (-(int)(size_t)dst) & (alignmentBoundary - 1);
                         }
 
                         // For padding > 15 bytes, multiple INS_align(15) are emitted.
@@ -12905,207 +12824,44 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                     }
                 }
 
+                // Add the padding, if needed.
+                if (paddingToAdd > 0)
+                {
+                    assert(!skipPadding);
+                    assert(((size_t)dst & (alignmentBoundary - 1)) != 0);
+
+                    unsigned padCounts   = paddingToAdd / 15;
+                    unsigned lastPadding = paddingToAdd % 15;
+
+                    //TODO: For padding > 15 bytes, evaluate the sequence of NOPs emitted
+                    //      and see if they can be improved.
+                    while (padCounts)
+                    {
+                        dst = emitOutputNOP(dst, 15);
+                        padCounts--;
+                    }
+
+                    dst = emitOutputNOP(dst, lastPadding);
+
+#if DEBUG
+                    if (displayAlignmentDetails)
+                    {
+                        printf("\t\t;; Add alignment: 'Padding= %d, AlignmentBoundary= %dB.' in (%s)\n",
+                               paddingToAdd, alignmentBoundary, emitComp->info.compFullName);
+                    }
+#endif
+                    // In the end dst should be at alignment boundary
+                    assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
+                }
+
                 // If we didn't add as much padding as we thought, update the code size and flag.
-                if (paddingAdded != id->idCodeSize())
+                if (paddingToAdd != id->idCodeSize())
                 {
-                    assert(paddingAdded != 0 || skipPadding);
-                    id->idCodeSize(paddingAdded);
+                    assert(paddingToAdd != 0 || skipPadding);
+                    id->idCodeSize(paddingToAdd);
                     ig->igFlags |= IGF_UPD_ISZ;
                 }
 
-//
-//                ///--------------------------------------------------------------
-//                if (!skipPadding && emitComp->opts.compJitAlignLoopAdaptive)
-//                {
-//                    unsigned loopSize                = getLoopSize(ig->igNext, maxLoopSize);
-//
-//                    // Start to align on 32B boundary with a fallback to 16B boundary
-//                    int      minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-//                    unsigned nMaxPaddingBytes       = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
-//                    unsigned nPaddingBytes          = (-(int)(size_t)dst) & (alignmentBoundary - 1);
-//
-//                    if (nPaddingBytes > nMaxPaddingBytes)
-//                    {
-//                        // Cannot add large padding to align to 32B, so try to align to 16B boundary.
-//                        alignmentBoundary = 16;
-//                        nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
-//                        nPaddingBytes     = (-(int)(size_t)dst) & (alignmentBoundary - 1);
-//
-//                        if (nPaddingBytes > nMaxPaddingBytes)
-//                        {
-//                            skipPadding = true;
-//#if DEBUG
-//                            if (displayAlignmentDetails)
-//                            {
-//                                printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
-//                                       "AlignmentBoundary= %dB.' in (%s)\n",
-//                                       nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary,
-//                                       emitComp->info.compFullName);
-//                            }
-//#endif
-//                        }
-//                    }
-//
-//                    if (!skipPadding && (nPaddingBytes > 0))
-//                    {
-//                        size_t extraBytesNotInLoop =
-//                            (32 * minBlocksNeededForLoop) - loopSize; // Still have it at alignmentboundary=32
-//                        size_t currentOffset = (size_t)dst % alignmentBoundary;
-//
-//                        // Padding is needed only if loop starts at or after the current offset.
-//                        // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
-//                        if (currentOffset <= extraBytesNotInLoop)
-//                        {
-//                            skipPadding = true;
-//#if DEBUG
-//                            if (displayAlignmentDetails)
-//                            {
-//                                printf("\t\t;; Skip alignment: 'Loop already aligned.' in (%s)\n",
-//                                       emitComp->info.compMethodName);
-//                            }
-//#endif
-//                        }
-//                        else
-//                        {
-//                            dst = emitOutputNOP(dst, nPaddingBytes);
-//                            assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
-//#if DEBUG
-//                            if (displayAlignmentDetails)
-//                            {
-//                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
-//                                       "AlignmentBoundary= %dB.' in (%s)\n",
-//                                       nPaddingBytes, loopSize, minBlocksNeededForLoop, alignmentBoundary,
-//                                       emitComp->info.compFullName);
-//                            }
-//#endif
-//                        }
-//                    }
-//
-//                    // Update the code size of id
-//                    if (skipPadding)
-//                    {
-//                        id->idCodeSize(0);
-//                        ig->igFlags |= IGF_UPD_ISZ;
-//                    }
-//                    else if (nPaddingBytes != id->idCodeSize())
-//                    {
-//                        id->idCodeSize(nPaddingBytes);
-//                        ig->igFlags |= IGF_UPD_ISZ;
-//                    }
-//                }
-//                else
-//                {
-//                    instrDesc* nextId      = id;
-//                    castto(nextId, BYTE*) += sz;
-//
-//                    // If we already know that the code size heuristics won't match,
-//                    // do not bother checking it again. Same applies for next instruction
-//                    // if that too is INS_align.
-//                    if ((id->idCodeSize() == 0))
-//                    {
-//                        if (nextId->idIns() == INS_align)
-//                        {
-//                            assert(alignmentBoundary > 16);
-//                            nextId->idCodeSize(0);
-//                        }
-//                        break;
-//                    }
-//
-//                    unsigned  loopSize     = 0;
-//                    insGroup* loopHeaderIg = ig->igNext;
-//                    for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
-//                    {
-//                        loopSize += igInLoop->igSize;
-//                        if (igInLoop->igLoopBackEdge == loopHeaderIg || loopSize > maxLoopSize)
-//                        {
-//                            break;
-//                        }
-//                    }
-//
-//                    if (loopSize > maxLoopSize)
-//                    {
-//                        skipPadding = true;
-//                        assert(
-//                            !"Should never hit maxLoopSize threshold because it should have been predicted earlier.");
-//                    }
-//                    else
-//                    {
-//                        unsigned minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-//                        unsigned extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
-//                        unsigned currentOffset       = (size_t)dst % alignmentBoundary;
-//
-//                        // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
-//                        if (emitComp->opts.compJitAlignLoopForJcc)
-//                        {
-//                            // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
-//                            currentOffset++;
-//                        }
-//
-//                        // TODO: Revisit nop sequence we emit in case of 31 bytes
-//
-//                        // Padding is needed only if loop starts at or after the current offset.
-//                        // Otherwise, the loop just fits in minBlocksNeededForLoop and so no alignment needed.
-//                        if (currentOffset <= extraBytesNotInLoop)
-//                        {
-//                            if (displayAlignmentDetails)
-//                            {
-//                                printf("\t\t;; Skip alignment: 'Loop already aligned.' in (%s)\n",
-//                                       emitComp->info.compMethodName);
-//                            }
-//                            if (nextId->idIns() == INS_align)
-//                            {
-//                                assert(alignmentBoundary > 16);
-//                                nextId->idCodeSize(0);
-//                            }
-//                        }
-//                        else
-//                        {
-//                            size_t nBytes = (-(int)(size_t)dst) & 0x0f;
-//                            dst           = emitOutputNOP(dst, nBytes);
-//
-//                            if (nextId->idIns() == INS_align)
-//                            {
-//                                // If next instruction is also alignment, this better be 32B padding.
-//                                assert(alignmentBoundary > 16);
-//
-//                                // Align further to 32B boundary, if it is not yet.
-//                                if (((size_t)dst & 0x1f) != 0)
-//                                {
-//                                    dst = emitOutputNOP(dst, 15);
-//                                    dst = emitOutputNOP(dst, 1);
-//                                    nBytes += 16;
-//                                }
-//                            }
-//                            if (displayAlignmentDetails)
-//                            {
-//                                printf("\t\t;; Add alignment: 'Padding= %d, LoopSize= %d, MinBlocks= %d, "
-//                                       "AlignmentBoundary= %dB.' in (%s)\n",
-//                                       nBytes, loopSize, minimumBlocksNeeded, alignmentBoundary,
-//                                       emitComp->info.compFullName);
-//                            }
-//                            // In the end dst should be at alignment boundary
-//                            assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
-//                        }
-//                    }
-//
-//
-//                    else
-//                    {
-//                        if (displayAlignmentDetails)
-//                        {
-//                            printf("\t\t;; Skip alignment: 'Loopsize= %d, AllowedMaxSize= %d.' in (%s)\n", loopSize,
-//                                   emitComp->opts.compJitAlignLoopMaxCodeSize, emitComp->info.compFullName);
-//                        }
-//                        // If next instruction is align, skip it so
-//                        // we do not check the heuristics again.
-//                        if (nextId->idIns() == INS_align)
-//                        {
-//                            assert(alignmentBoundary > 16);
-//                            nextId->idCodeSize(0);
-//                        }
-//                    }
-//                }
-
                 break;
             }
 
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index ff4b1243289d7c..87fb5982827c56 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2910,15 +2910,8 @@ bool Compiler::optCanonicalizeLoop(unsigned char loopInd)
     {
         optLoopTable[loopInd].lpEntry = newT;
     }
-    // assert((optLoopTable[loopInd].lpFirst->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP) == 0);
     optLoopTable[loopInd].lpTop   = newT;
     optLoopTable[loopInd].lpFirst = newT;
-    // Something to investigate
-    /*if ((optLoopTable[loopInd].lpFirst->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP) != 0)
-    {
-        newT->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
-    }
-    newT->bbFlags |= (optLoopTable[loopInd].lpFirst->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP);*/
 
     newT->bbNatLoopNum = loopInd;
 

From f66ee24e1eebf6a7113e9ee09dfba274175601b1 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 11 Nov 2020 17:51:41 -0800
Subject: [PATCH 29/59] minor fixes

---
 src/coreclr/jit/compiler.cpp | 5 ++---
 src/coreclr/jit/compiler.h   | 2 +-
 src/coreclr/jit/emit.cpp     | 9 +++++++--
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 0717c551d39592..575ad1db9b46b8 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2617,13 +2617,11 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
 
 #ifdef DEBUG
     opts.compJitAlignLoopAdaptive       = JitConfig.JitAlignLoopAdaptive() == 1;
-    opts.compJitAlignLoopBoundary       = ReinterpretHexAsDecimal(JitConfig.JitAlignLoopBoundary());
+    opts.compJitAlignLoopBoundary       = JitConfig.JitAlignLoopBoundary();
     opts.compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
 
     opts.compJitAlignLoopForJcc         = JitConfig.JitAlignLoopForJcc() == 1;
     opts.compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
-
-    assert(isPow2(opts.compJitAlignLoopBoundary));
 #else
     opts.compJitAlignLoopAdaptive = true;
     opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
@@ -2634,6 +2632,7 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     {
         opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
     }
+    assert(isPow2(opts.compJitAlignLoopBoundary));
 
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 261057485b87e6..a5847e6b151b78 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9037,7 +9037,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #endif
 
 #define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 10
-#define DEFAULT_ALIGN_LOOP_BOUNDARY 32
+#define DEFAULT_ALIGN_LOOP_BOUNDARY 0x20
 #define DEFAULT_MAX_LOOPSIZE_FOR_ALIGN DEFAULT_ALIGN_LOOP_BOUNDARY * 3
 
 #ifdef DEBUG
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index ca7c6566f0eae3..152933a2de2147 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3652,8 +3652,13 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
     if (csz != id->idCodeSize())
     {
         /* It is fatal to under-estimate the instruction size, except it was an alignment instruction */
-        noway_assert(id->idCodeSize() >= csz || (!emitComp->opts.compJitAlignLoopAdaptive && id->idIns() == INS_align &&
-                                                 emitComp->opts.compJitAlignLoopBoundary > 16));
+        bool validCodeSize = id->idCodeSize() >= csz;
+
+#if defined(TARGET_XARCH)
+        validCodeSize |= (!emitComp->opts.compJitAlignLoopAdaptive && id->idIns() == INS_align &&
+                          emitComp->opts.compJitAlignLoopBoundary > 16);
+#endif
+        noway_assert(validCodeSize);
 
 #if DEBUG_EMIT
         if (EMITVERBOSE)

From f2f935d7cebbd63c074049705af9281c81d29074 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 12 Nov 2020 11:33:19 -0800
Subject: [PATCH 30/59] Fix issues: - Make sure all `align` instructions for
 non-adaptive fall under same IG - Convert some variables to `unsigned short`
 - Fixed the maxPadding amount for adaptive alignment calculation

---
 src/coreclr/jit/compiler.cpp  |  6 +++---
 src/coreclr/jit/compiler.h    |  6 +++---
 src/coreclr/jit/emit.cpp      | 16 ++++++++++------
 src/coreclr/jit/emitxarch.cpp | 18 +++++++++++++-----
 src/coreclr/jit/emitxarch.h   |  2 +-
 5 files changed, 30 insertions(+), 18 deletions(-)

diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 575ad1db9b46b8..1e60411bcc1b11 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2617,11 +2617,11 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
 
 #ifdef DEBUG
     opts.compJitAlignLoopAdaptive       = JitConfig.JitAlignLoopAdaptive() == 1;
-    opts.compJitAlignLoopBoundary       = JitConfig.JitAlignLoopBoundary();
-    opts.compJitAlignLoopMinBlockWeight = JitConfig.JitAlignLoopMinBlockWeight();
+    opts.compJitAlignLoopBoundary       = (unsigned short) JitConfig.JitAlignLoopBoundary();
+    opts.compJitAlignLoopMinBlockWeight = (unsigned short) JitConfig.JitAlignLoopMinBlockWeight();
 
     opts.compJitAlignLoopForJcc         = JitConfig.JitAlignLoopForJcc() == 1;
-    opts.compJitAlignLoopMaxCodeSize    = JitConfig.JitAlignLoopMaxCodeSize();
+    opts.compJitAlignLoopMaxCodeSize    = (unsigned short) JitConfig.JitAlignLoopMaxCodeSize();
 #else
     opts.compJitAlignLoopAdaptive = true;
     opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index a5847e6b151b78..acd2d101354fbe 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9045,13 +9045,13 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         bool compJitAlignLoopForJcc;             // If set, for non-adaptive alignment, ensure loop jmps are not on or
                                                  // cross alignment boundary.
 #endif
-        unsigned compJitAlignLoopMaxCodeSize; // For non-adaptive alignment, minimum loop size (in bytes) for which
+        unsigned short compJitAlignLoopMaxCodeSize; // For non-adaptive alignment, minimum loop size (in bytes) for which
                                               // alignment will be done.
 
-        unsigned compJitAlignLoopMinBlockWeight; // Minimum weight needed for the first block of a loop to make it a
+        unsigned short compJitAlignLoopMinBlockWeight; // Minimum weight needed for the first block of a loop to make it a
                                                  // candidate for alignment.
 
-        unsigned compJitAlignLoopBoundary; // For non-adaptive alignment, address boundary (power of 2) at which
+        unsigned short compJitAlignLoopBoundary; // For non-adaptive alignment, address boundary (power of 2) at which
                                            // loop alignment should be done. By default, 32B.
 
         bool compJitAlignLoopAdaptive; // If set, perform adaptive loop alignment that limits number of padding
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 152933a2de2147..b48519a3d39be3 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4517,18 +4517,20 @@ void emitter::emitLoopAlignAdjustments()
 {
 #ifdef TARGET_XARCH
 
-    unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
-    unsigned maxLoopSize = 0;
+    unsigned short maxPaddingAdded, alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
+    unsigned       maxLoopSize = 0;
     if (emitComp->opts.compJitAlignLoopAdaptive)
     {
         // For adaptive, adjust the loop size depending on the alignment boundary
-        int maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
+        int maxBlocksAllowedForLoop = genLog2((unsigned)alignmentBoundary) - 1;
         maxLoopSize                 = alignmentBoundary * maxBlocksAllowedForLoop;
+        maxPaddingAdded             = (alignmentBoundary >> 1) - 1;
     }
     else
     {
         // For non-adaptive, just take whatever is supplied using COMPlus_ variables
         maxLoopSize       = emitComp->opts.compJitAlignLoopMaxCodeSize;
+        maxPaddingAdded = alignmentBoundary - 1;
     }
 
     unsigned alignBytesRemoved = 0;
@@ -4544,9 +4546,11 @@ void emitter::emitLoopAlignAdjustments()
 
         if (getLoopSize(ig->igNext, maxLoopSize) > maxLoopSize)
         {
-            ig->igSize -= 15;
-            alignBytesRemoved += 15;
-            emitTotalCodeSize -= 15;
+            assert(ig->igSize >= maxPaddingAdded);
+
+            ig->igSize -= maxPaddingAdded;
+            alignBytesRemoved += maxPaddingAdded;
+            emitTotalCodeSize -= maxPaddingAdded;
 
             // Update the flags
             ig->igFlags |= IGF_UPD_ISZ;
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 35dda98594879c..a33a1ba6e6c2c2 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2676,11 +2676,19 @@ void emitter::emitLoopAlign()
  *  the x86 I-cache alignment rule is followed.
  */
 
-void emitter::emitVariableLoopAlign(unsigned alignmentBoundary)
+void emitter::emitVariableLoopAlign(unsigned short alignmentBoundary)
 {
-    unsigned nPaddingBytes    = alignmentBoundary - 1;
-    unsigned insAlignCount    = nPaddingBytes / 15;
-    unsigned lastInsAlignSize = nPaddingBytes % 15;
+    unsigned short nPaddingBytes    = alignmentBoundary - 1;
+    unsigned short nAlignInstr      = (nPaddingBytes + (15 - 1)) / 15;
+    unsigned short instrDescSize    = nAlignInstr * SMALL_IDSC_SIZE;
+    unsigned short insAlignCount    = nPaddingBytes / 15;
+    unsigned short lastInsAlignSize = nPaddingBytes % 15;
+
+    // Ensure that all align instructions fall in same IG.
+    if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp)
+    {
+        emitForceNewIG = true;
+    }
 
     while (insAlignCount)
     {
@@ -12769,7 +12777,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                             break;
                         }
 
-                        unsigned maxLoopSize         = emitComp->opts.compJitAlignLoopMaxCodeSize;
+                        unsigned short maxLoopSize   = emitComp->opts.compJitAlignLoopMaxCodeSize;
                         unsigned loopSize            = getLoopSize(ig->igNext, maxLoopSize);
                         unsigned minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
                         unsigned extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index e3e8940c49c20e..1eafe7d90a4c60 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -289,7 +289,7 @@ inline emitAttr emitDecodeScale(unsigned ensz)
 public:
 void emitLoopAlign();
 
-void emitVariableLoopAlign(unsigned alignmentBoundary);
+void emitVariableLoopAlign(unsigned short alignmentBoundary);
 
 void emitIns(instruction ins);
 

From 99ea31f48a51b599f10c589b278202dd74cfeed4 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 12 Nov 2020 13:11:14 -0800
Subject: [PATCH 31/59] Other fixes

---
 src/coreclr/jit/codegenlinear.cpp | 2 ++
 src/coreclr/jit/compiler.cpp      | 5 ++---
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index ec7a564b06a46d..cdf2f25d3e2284 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -787,11 +787,13 @@ void CodeGen::genCodeForBBlist()
             // all IGs that follows this IG and participate in a loop.
             GetEmitter()->emitCurIG->igFlags |= IGF_ALIGN_LOOP;
 
+#if defined(DEBUG)
             if (verbose)
             {
                 printf("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u to align loop header block.\n" FMT_BB,
                        compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum);
             }
+#endif
         }
 #endif
 
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 1e60411bcc1b11..585fd17a222a1e 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -3934,12 +3934,11 @@ void Compiler::compSetOptimizationLevel()
 
         if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_RELOC))
         {
-            codeGen->SetAlignLoops(false); // loop alignment not supported for prejitted code
-
             // The JIT doesn't currently support loop alignment for prejitted images.
             // (The JIT doesn't know the final address of the code, hence
             // it can't align code based on unknown addresses.)
-            assert(JitConfig.JitAlignLoops() == 0);
+
+            codeGen->SetAlignLoops(false); // loop alignment not supported for prejitted code
         }
         else
         {

From 8f64963b9944d92a689c6fa30c347e3736b5dc44 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 3 Dec 2020 17:21:22 -0800
Subject: [PATCH 32/59] Remove align_loops flag from coreclr

---
 src/coreclr/inc/clrconfigvalues.h                     | 1 -
 src/coreclr/inc/corjitflags.h                         | 1 -
 src/coreclr/jit/jitee.h                               | 2 --
 src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs | 1 -
 src/coreclr/vm/eeconfig.cpp                           | 2 --
 src/coreclr/vm/eeconfig.h                             | 2 --
 src/coreclr/vm/jitinterface.cpp                       | 2 --
 7 files changed, 11 deletions(-)

diff --git a/src/coreclr/inc/clrconfigvalues.h b/src/coreclr/inc/clrconfigvalues.h
index fb0d859f8db8b7..6ddd274ac9fce8 100644
--- a/src/coreclr/inc/clrconfigvalues.h
+++ b/src/coreclr/inc/clrconfigvalues.h
@@ -302,7 +302,6 @@ RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_UseIBCFile, W("UseIBCFile"), 0, "", CLRConf
 ///
 /// JIT
 ///
-RETAIL_CONFIG_DWORD_INFO_DIRECT_ACCESS(UNSUPPORTED_JitAlignLoops, W("JitAlignLoops"), "Aligns loop targets to 8 byte boundaries")
 CONFIG_DWORD_INFO_EX(INTERNAL_JitBreakEmit, W("JitBreakEmit"), (DWORD)-1, "", CLRConfig::EEConfig_default)
 CONFIG_DWORD_INFO_DIRECT_ACCESS(INTERNAL_JitDebuggable, W("JitDebuggable"), "")
 #if !defined(DEBUG) && !defined(_DEBUG)
diff --git a/src/coreclr/inc/corjitflags.h b/src/coreclr/inc/corjitflags.h
index 83cbc20be8863a..0c6eacaee02e53 100644
--- a/src/coreclr/inc/corjitflags.h
+++ b/src/coreclr/inc/corjitflags.h
@@ -79,7 +79,6 @@ class CORJIT_FLAGS
         CORJIT_FLAG_BBINSTR                 = 29, // Collect basic block profile information
         CORJIT_FLAG_BBOPT                   = 30, // Optimize method based on profile information
         CORJIT_FLAG_FRAMED                  = 31, // All methods have an EBP frame
-        CORJIT_FLAG_ALIGN_LOOPS             = 32, // add NOPs before loops to align them at 16 byte boundaries
         CORJIT_FLAG_PUBLISH_SECRET_PARAM    = 33, // JIT must place stub secret param into local 0.  (used by IL stubs)
         CORJIT_FLAG_UNUSED12                = 34,
         CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
diff --git a/src/coreclr/jit/jitee.h b/src/coreclr/jit/jitee.h
index 298536138b2e1a..964fd0f0c70898 100644
--- a/src/coreclr/jit/jitee.h
+++ b/src/coreclr/jit/jitee.h
@@ -63,7 +63,6 @@ class JitFlags
         JIT_FLAG_BBINSTR                 = 29, // Collect basic block profile information
         JIT_FLAG_BBOPT                   = 30, // Optimize method based on profile information
         JIT_FLAG_FRAMED                  = 31, // All methods have an EBP frame
-        JIT_FLAG_ALIGN_LOOPS             = 32, // add NOPs before loops to align them at 16 byte boundaries
         JIT_FLAG_PUBLISH_SECRET_PARAM    = 33, // JIT must place stub secret param into local 0.  (used by IL stubs)
         JIT_FLAG_UNUSED12                = 34,
         JIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
@@ -201,7 +200,6 @@ class JitFlags
         FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_BBINSTR, JIT_FLAG_BBINSTR);
         FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_BBOPT, JIT_FLAG_BBOPT);
         FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_FRAMED, JIT_FLAG_FRAMED);
-        FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_ALIGN_LOOPS, JIT_FLAG_ALIGN_LOOPS);
         FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_PUBLISH_SECRET_PARAM, JIT_FLAG_PUBLISH_SECRET_PARAM);
         FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_SAMPLING_JIT_BACKGROUND, JIT_FLAG_SAMPLING_JIT_BACKGROUND);
         FLAGS_EQUAL(CORJIT_FLAGS::CORJIT_FLAG_USE_PINVOKE_HELPERS, JIT_FLAG_USE_PINVOKE_HELPERS);
diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
index 79768f5fbdb9eb..7b3a08b5e47dba 100644
--- a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
+++ b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
@@ -1307,7 +1307,6 @@ public enum CorJitFlag : uint
         CORJIT_FLAG_BBINSTR = 29, // Collect basic block profile information
         CORJIT_FLAG_BBOPT = 30, // Optimize method based on profile information
         CORJIT_FLAG_FRAMED = 31, // All methods have an EBP frame
-        CORJIT_FLAG_ALIGN_LOOPS = 32, // add NOPs before loops to align them at 16 byte boundaries
         CORJIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0.  (used by IL stubs)
         CORJIT_FLAG_UNUSED8 = 34,
         CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
diff --git a/src/coreclr/vm/eeconfig.cpp b/src/coreclr/vm/eeconfig.cpp
index 389e4024e8c3bf..c1336060d21b71 100644
--- a/src/coreclr/vm/eeconfig.cpp
+++ b/src/coreclr/vm/eeconfig.cpp
@@ -118,7 +118,6 @@ HRESULT EEConfig::Init()
 
     iJitOptimizeType = OPT_DEFAULT;
     fJitFramed = false;
-    fJitAlignLoops = false;
     fJitMinOpts = false;
     fPInvokeRestoreEsp = (DWORD)-1;
 
@@ -689,7 +688,6 @@ fTrackDynamicMethodDebugInfo = CLRConfig::GetConfigValue(CLRConfig::UNSUPPORTED_
     dwJitHostMaxSlabCache = CLRConfig::GetConfigValue(CLRConfig::EXTERNAL_JitHostMaxSlabCache);
 
     fJitFramed = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JitFramed, fJitFramed) != 0);
-    fJitAlignLoops = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JitAlignLoops, fJitAlignLoops) != 0);
     fJitMinOpts = (GetConfigDWORD_DontUse_(CLRConfig::UNSUPPORTED_JITMinOpts, fJitMinOpts) == 1);
     iJitOptimizeType      =  GetConfigDWORD_DontUse_(CLRConfig::EXTERNAL_JitOptimizeType, iJitOptimizeType);
     if (iJitOptimizeType > OPT_RANDOM)     iJitOptimizeType = OPT_DEFAULT;
diff --git a/src/coreclr/vm/eeconfig.h b/src/coreclr/vm/eeconfig.h
index 46616fa1f5d002..a068e447117e18 100644
--- a/src/coreclr/vm/eeconfig.h
+++ b/src/coreclr/vm/eeconfig.h
@@ -75,7 +75,6 @@ class EEConfig
     bool          GetTrackDynamicMethodDebugInfo(void)      const {LIMITED_METHOD_CONTRACT;  return fTrackDynamicMethodDebugInfo; }
     unsigned int  GenOptimizeType(void)                     const {LIMITED_METHOD_CONTRACT;  return iJitOptimizeType; }
     bool          JitFramed(void)                           const {LIMITED_METHOD_CONTRACT;  return fJitFramed; }
-    bool          JitAlignLoops(void)                       const {LIMITED_METHOD_CONTRACT;  return fJitAlignLoops; }
     bool          JitMinOpts(void)                          const {LIMITED_METHOD_CONTRACT;  return fJitMinOpts; }
 
     // Tiered Compilation config
@@ -537,7 +536,6 @@ class EEConfig
     DWORD dwJitHostMaxSlabCache;       // max size for jit host slab cache
     bool fTrackDynamicMethodDebugInfo; //  Enable/Disable tracking dynamic method debug info
     bool fJitFramed;                   // Enable/Disable EBP based frames
-    bool fJitAlignLoops;               // Enable/Disable loop alignment
     bool fJitMinOpts;                  // Enable MinOpts for all jitted methods
 
     unsigned iJitOptimizeType; // 0=Blended,1=SmallCode,2=FastCode,              default is 0=Blended
diff --git a/src/coreclr/vm/jitinterface.cpp b/src/coreclr/vm/jitinterface.cpp
index 0d60059283a3d0..aa60a55ceb3e27 100644
--- a/src/coreclr/vm/jitinterface.cpp
+++ b/src/coreclr/vm/jitinterface.cpp
@@ -12676,8 +12676,6 @@ CorJitResult CallCompileMethodWithSEHWrapper(EEJitManager *jitMgr,
     CORJIT_FLAGS flags;
     if (g_pConfig->JitFramed())
         flags.Set(CORJIT_FLAGS::CORJIT_FLAG_FRAMED);
-    if (g_pConfig->JitAlignLoops())
-        flags.Set(CORJIT_FLAGS::CORJIT_FLAG_ALIGN_LOOPS);
 #ifdef TARGET_X86
     if (g_pConfig->PInvokeRestoreEsp(ftn->GetModule()->IsPreV4Assembly()))
         flags.Set(CORJIT_FLAGS::CORJIT_FLAG_PINVOKE_RESTORE_ESP);

From 1c85c3c0c85c3ee8762f6ebec799c9c269e3ba18 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 3 Dec 2020 17:21:41 -0800
Subject: [PATCH 33/59] Review feedback

- Do not align loop if it has call
- Created `emitSetLoopBackEdge()` to isolate `emitCurIG` inside emitter class
- Created `emitOutputAlign()` to move the align instruction output logic
- Renamed emitVariableLoopAlign() to emitLongLoopAlign()
- Created `optIdentifyLoopsForAlignment()` to identify loops that need alignment
- Added comments at various places
---
 src/coreclr/jit/block.cpp         |   2 +-
 src/coreclr/jit/block.h           |   2 +-
 src/coreclr/jit/codegenlinear.cpp |  60 ++--
 src/coreclr/jit/compiler.h        |  36 ++-
 src/coreclr/jit/emit.cpp          | 158 +++++-----
 src/coreclr/jit/emit.h            |   3 +-
 src/coreclr/jit/emitxarch.cpp     | 503 +++++++++++++++---------------
 src/coreclr/jit/emitxarch.h       |   3 +-
 src/coreclr/jit/flowgraph.cpp     |  15 +-
 src/coreclr/jit/optimizer.cpp     |  56 +++-
 10 files changed, 450 insertions(+), 388 deletions(-)

diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp
index ff3902ed564ef7..aa5a72cbfff63b 100644
--- a/src/coreclr/jit/block.cpp
+++ b/src/coreclr/jit/block.cpp
@@ -505,7 +505,7 @@ void BasicBlock::dspFlags()
     {
         printf("cfe ");
     }
-    if (bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
+    if (bbFlags & BBF_LOOP_ALIGN)
     {
         printf("finnerloop ");
     }
diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h
index 41b0ba8a0d94da..9f16020d93c3a7 100644
--- a/src/coreclr/jit/block.h
+++ b/src/coreclr/jit/block.h
@@ -448,7 +448,7 @@ struct BasicBlock : private LIR::Range
 
 #define BBF_PATCHPOINT                     MAKE_BBFLAG(36) // Block is a patchpoint
 #define BBF_HAS_CLASS_PROFILE              MAKE_BBFLAG(37) // BB contains a call needing a class profile
-#define BBF_FIRST_BLOCK_IN_INNERLOOP       MAKE_BBFLAG(39) // Block is lexically the fist block within the innermost loop.
+#define BBF_LOOP_ALIGN                     MAKE_BBFLAG(39) // Block is lexically the fist block within the innermost loop.
 
 // clang-format on
 
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index cdf2f25d3e2284..5af9c5b68f816d 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -735,32 +735,21 @@ void CodeGen::genCodeForBBlist()
 
             case BBJ_ALWAYS:
                 inst_JMP(EJ_jmp, block->bbJumpDest);
-                __fallthrough;
+                FALLTHROUGH;
+
             case BBJ_COND:
-                if (block->bbJumpDest->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
+
+                // This is the last place where we operate on blocks and after this, we operate
+                // on IG. Hence, if we know that the destination of "block" is the first block
+                // of a loop and needs alignment (it has BBF_LOOP_ALIGN), then "block" represents
+                // end of the loop. Propagate that information on the IG through "igLoopBackEdge".
+                //
+                // During emitter, this information will be used to calculate the loop size.
+                // Depending on the loop size, decision of whether to align a loop or not will be taken.
+
+                if (block->bbJumpDest->bbFlags & BBF_LOOP_ALIGN)
                 {
-                    // Track the destination IG which is the first block of inner loop.
-                    // In emitter, this will be used to calculate total instructions present
-                    // in all IGs that participate in a loop.
-
-                    insGroup* srcIG = GetEmitter()->emitCurIG;
-                    insGroup* dstIG = (insGroup*)block->bbJumpDest->bbEmitCookie;
-
-                    // Only track back edges to the loop.
-                    // Here dstIG != nullptr checks if we have already generated dstIG for a block.
-                    // If block->bbJumpDest was a forward block, it might have not been created yet.
-                    // We don't rely on (block->bbJumpDest->bbNum <= block->bbNum) because the basic
-                    // block numbering is not guaranteed to be sequential.
-                    if (dstIG != nullptr && dstIG->igNum <= srcIG->igNum)
-                    {
-                        srcIG->igLoopBackEdge = dstIG;
-#ifdef DEBUG
-                        if (verbose)
-                        {
-                            printf("** IG_%d jumps back to IG_%d forming a loop.\n", srcIG->igNum, dstIG->igNum);
-                        }
-#endif
-                    }
+                    GetEmitter()->emitSetLoopBackEdge((insGroup*)block->bbJumpDest->bbEmitCookie);
                 }
                 break;
 
@@ -770,13 +759,23 @@ void CodeGen::genCodeForBBlist()
         }
 
 #if defined(TARGET_XARCH)
-        if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP))
+
+        // If next block is the first block of a loop (identified by BBF_LOOP_ALIGN),
+        // then need to add align instruction in current "block". Also mark the
+        // corresponding IG with IGF_ALIGN_LOOP to know that there will be align
+        // instructions at the end of that IG.
+        //
+        // For non-adaptive alignment, add alignment instruction of size depending on the
+        // compJitAlignLoopBoundary.
+        // For adaptive alignment, alignment instruction will always be of 15 bytes.
+
+        if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_LOOP_ALIGN))
         {
             assert(ShouldAlignLoops());
 
             if ((compiler->opts.compJitAlignLoopBoundary > 16) && (!compiler->opts.compJitAlignLoopAdaptive))
             {
-                GetEmitter()->emitVariableLoopAlign(compiler->opts.compJitAlignLoopBoundary);
+                GetEmitter()->emitLongLoopAlign(compiler->opts.compJitAlignLoopBoundary);
             }
             else
             {
@@ -787,13 +786,8 @@ void CodeGen::genCodeForBBlist()
             // all IGs that follows this IG and participate in a loop.
             GetEmitter()->emitCurIG->igFlags |= IGF_ALIGN_LOOP;
 
-#if defined(DEBUG)
-            if (verbose)
-            {
-                printf("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u to align loop header block.\n" FMT_BB,
-                       compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum);
-            }
-#endif
+            JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u to align loop header block.\n" FMT_BB,
+                    compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum);
         }
 #endif
 
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index acd2d101354fbe..5c05d0f12aa33e 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -6367,6 +6367,8 @@ class Compiler
 
     void optFindNaturalLoops();
 
+    void optIdentifyLoopsForAlignment();
+
     // Ensures that all the loops in the loop nest rooted at "loopInd" (an index into the loop table) are 'canonical' --
     // each loop has a unique "top."  Returns "true" iff the flowgraph has been modified.
     bool optCanonicalizeLoopNest(unsigned char loopInd);
@@ -9036,26 +9038,40 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         bool dspGCtbls;       // Display the GC tables
 #endif
 
+        // Default numbers used to perform loop alignment. All the numbers are choosen
+        // based on experimenting with various benchmarks.
+
+        // Default minimum loop block weight required to enable loop alignment.
 #define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 10
+
+        // By default a loop will be aligned at 32B address boundary to get better
+        // performance as per architecture manuals.
 #define DEFAULT_ALIGN_LOOP_BOUNDARY 0x20
+
+        // For non-adaptive loop alignment, by default, only align a loop whose size is
+        // atmost 3 times of 32B chunk. If the loop is bigger than that, it is most
+        // likely the loop code is complicated enough and aligning such loop will not help
+        // much.
 #define DEFAULT_MAX_LOOPSIZE_FOR_ALIGN DEFAULT_ALIGN_LOOP_BOUNDARY * 3
 
 #ifdef DEBUG
         // Loop alignment variables
-        bool compJitAlignLoopForJcc;             // If set, for non-adaptive alignment, ensure loop jmps are not on or
-                                                 // cross alignment boundary.
+
+        // If set, for non-adaptive alignment, ensure loop jmps are not on or cross alignment boundary.
+        bool compJitAlignLoopForJcc;
 #endif
-        unsigned short compJitAlignLoopMaxCodeSize; // For non-adaptive alignment, minimum loop size (in bytes) for which
-                                              // alignment will be done.
+        // For non-adaptive alignment, minimum loop size (in bytes) for which alignment will be done.
+        unsigned short compJitAlignLoopMaxCodeSize;
 
-        unsigned short compJitAlignLoopMinBlockWeight; // Minimum weight needed for the first block of a loop to make it a
-                                                 // candidate for alignment.
+        // Minimum weight needed for the first block of a loop to make it a candidate for alignment.
+        unsigned short compJitAlignLoopMinBlockWeight;
 
-        unsigned short compJitAlignLoopBoundary; // For non-adaptive alignment, address boundary (power of 2) at which
-                                           // loop alignment should be done. By default, 32B.
+        // For non-adaptive alignment, address boundary (power of 2) at which loop alignment should
+        // be done. By default, 32B.
+        unsigned short compJitAlignLoopBoundary;
 
-        bool compJitAlignLoopAdaptive; // If set, perform adaptive loop alignment that limits number of padding
-                                       // based on loop size.
+        // If set, perform adaptive loop alignment that limits number of padding based on loop size.
+        bool compJitAlignLoopAdaptive;
 
 #ifdef LATE_DISASM
         bool doLateDisasm; // Run the late disassembler
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index b48519a3d39be3..bb908c5d1de795 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3613,6 +3613,10 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
 {
     size_t is;
 
+#ifdef DEBUG
+    size_t     beforeAddr = (size_t)*dp;
+#endif
+
     /* Record the beginning offset of the instruction */
 
     BYTE* curInsAdr = *dp;
@@ -3651,12 +3655,12 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
 
     if (csz != id->idCodeSize())
     {
-        /* It is fatal to under-estimate the instruction size, except it was an alignment instruction */
+        // It is fatal to under-estimate the instruction size, except for alignment instructions
         bool validCodeSize = id->idCodeSize() >= csz;
 
 #if defined(TARGET_XARCH)
-        validCodeSize |= (!emitComp->opts.compJitAlignLoopAdaptive && id->idIns() == INS_align &&
-                          emitComp->opts.compJitAlignLoopBoundary > 16);
+        validCodeSize |= (!emitComp->opts.compJitAlignLoopAdaptive && (id->idIns() == INS_align) &&
+                          (emitComp->opts.compJitAlignLoopBoundary > 16));
 #endif
         noway_assert(validCodeSize);
 
@@ -3667,7 +3671,6 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
         }
 #endif // DEBUG_EMIT
 
-
         /* The instruction size estimate wasn't accurate; remember this */
 
         ig->igFlags |= IGF_UPD_ISZ;
@@ -3691,6 +3694,51 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
                id->idDebugOnlyInfo()->idNum, is, emitSizeOfInsDsc(id));
         assert(is == emitSizeOfInsDsc(id));
     }
+
+    // Print the alignment boundary
+    if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr)
+    {
+        size_t currAddr         = (size_t)*dp;
+        size_t lastBoundaryAddr = currAddr & ~((size_t)emitComp->opts.compJitAlignLoopBoundary - 1);
+
+        // draw boundary if beforeAddr was before the lastBoundary.
+        if (beforeAddr < lastBoundaryAddr)
+        {
+            printf("; ");
+            instruction currIns = id->idIns();
+
+#if defined(TARGET_XARCH)
+
+            // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
+            bool isJccAffectedIns =
+                ((currIns >= INS_i_jmp && currIns < INS_align) || (currIns == INS_call) || (currIns == INS_ret));
+
+            instrDesc* nextId = id;
+            castto(nextId, BYTE*) += is;
+            instruction nextIns = nextId->idIns();
+            if ((currIns == INS_cmp) || (currIns == INS_test) || (currIns == INS_add) || (currIns == INS_sub) ||
+                (currIns == INS_and) || (currIns == INS_inc) || (currIns == INS_dec))
+            {
+                isJccAffectedIns |= (nextIns >= INS_i_jmp && nextIns < INS_align);
+            }
+#else
+            bool isJccAffectedIns = false;
+#endif
+
+            // Indicate if the instruction is at the alignment boundary or is split across it
+            unsigned bytesCrossedBoundary = (currAddr & (emitComp->opts.compJitAlignLoopBoundary - 1));
+            if ((bytesCrossedBoundary != 0) || (isJccAffectedIns && bytesCrossedBoundary == 0))
+            {
+                printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(id->idIns()),
+                       bytesCrossedBoundary);
+            }
+            else
+            {
+                printf("...............................");
+            }
+            printf(" %dB boundary ...............................\n", (emitComp->opts.compJitAlignLoopBoundary));
+        }
+    }
 #endif
 
     return is;
@@ -4487,18 +4535,17 @@ void emitter::emitJumpDistBind()
 }
 
 
-/*****************************************************************************
- *  For loopHeaderIg, find the size of the smallest possible loop that doesn't exceed maxLoopSize.
- */
-
-unsigned emitter::getLoopSize(insGroup* loopHeaderIg, unsigned maxLoopSize)
+//-----------------------------------------------------------------------------
+//  For igLoopHeader, find the size of the smallest possible loop that doesn't exceed maxLoopSize.
+//
+unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize)
 {
     unsigned  loopSize     = 0;
 
-    for (insGroup* igInLoop = loopHeaderIg; igInLoop; igInLoop = igInLoop->igNext)
+    for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext)
     {
         loopSize += igInLoop->igSize;
-        if (igInLoop->igLoopBackEdge == loopHeaderIg || loopSize > maxLoopSize)
+        if ((igInLoop->igLoopBackEdge == igLoopHeader) || (loopSize > maxLoopSize))
         {
             break;
         }
@@ -4507,12 +4554,30 @@ unsigned emitter::getLoopSize(insGroup* loopHeaderIg, unsigned maxLoopSize)
     return loopSize;
 }
 
-/*****************************************************************************
- *  For IGs that adds padding to align loops, calculate the loop size and if it exceed the
-    threshold, then mark that alignment is not needed and hence adjust the igOffs, igSize
-    and emitTotalCodeSize.
-*/
+//-----------------------------------------------------------------------------
+// emitCurIG jumps back to dstIG forming a loop. Set appropriate field to
+// record that information
+//
+void emitter::emitSetLoopBackEdge(insGroup* dstIG)
+{
+    // Only track back edges to the loop.
+    // Here dstIG != nullptr checks if we have already generated dstIG for a block.
+    // If block->bbJumpDest was a forward block, it might have not been created yet.
+    // We don't rely on (block->bbJumpDest->bbNum <= block->bbNum) because the basic
+    // block numbering is not guaranteed to be sequential.
+    if ((dstIG != nullptr) && (dstIG->igNum <= emitCurIG->igNum))
+    {
+        emitCurIG->igLoopBackEdge = dstIG;
+
+        JITDUMP("** IG_%d jumps back to IG_%d forming a loop.\n", emitCurIG->igNum, dstIG->igNum);
+    }
+}
 
+//-----------------------------------------------------------------------------
+//  For IGs that add padding to align loops, calculate the loop size and, if it exceeds the
+//  threshold, mark that alignment is not needed and hence adjust the igOffs, igSize
+//  and emitTotalCodeSize.
+//
 void emitter::emitLoopAlignAdjustments()
 {
 #ifdef TARGET_XARCH
@@ -4544,7 +4609,8 @@ void emitter::emitLoopAlignAdjustments()
             continue;
         }
 
-        if (getLoopSize(ig->igNext, maxLoopSize) > maxLoopSize)
+        unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize);
+        if (loopSize > maxLoopSize)
         {
             assert(ig->igSize >= maxPaddingAdded);
 
@@ -4556,12 +4622,8 @@ void emitter::emitLoopAlignAdjustments()
             ig->igFlags |= IGF_UPD_ISZ;
             ig->igFlags &= ~IGF_ALIGN_LOOP;
 
-#if DEBUG
-            if (emitComp->verbose)
-            {
-                printf("Removed loop alignment from G_M%03u_IG%02u: 'MaxLoopSize= %d\n", emitComp->compMethodID, ig->igNum, maxLoopSize);
-            }
-#endif
+            JITDUMP("Removed loop alignment from G_M%03u_IG%02u: 'LoopSize= %d, MaxLoopSize= %d\n",
+                    emitComp->compMethodID, ig->igNum, loopSize, maxLoopSize);
         }
     }
 #endif
@@ -5294,57 +5356,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
 
         for (unsigned cnt = ig->igInsCnt; cnt; cnt--)
         {
-#ifdef DEBUG
-            size_t     lastCp = (size_t)cp;
-            instrDesc* lastId = id;
-#endif
             castto(id, BYTE*) += emitIssue1Instr(ig, id, &cp);
-#ifdef DEBUG
-
-            if ((emitComp->opts.disAsm || emitComp->verbose) && emitComp->opts.disAddr)
-            {
-                size_t lastBoundaryAddr = (size_t)cp & ~((size_t)emitComp->opts.compJitAlignLoopBoundary - 1);
-
-                // draw boundary if lastCp was before the lastBoundary.
-                if (lastCp < lastBoundaryAddr)
-                {
-                    printf("; ");
-                    instruction lastIns = lastId->idIns();
-
-#if defined(TARGET_XARCH)
-                    // https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
-                    bool isJccAffectedIns = ((lastIns >= INS_i_jmp && lastIns < INS_align) || (lastIns == INS_call) ||
-                                             (lastIns == INS_ret));
-                    if (cnt)
-                    {
-                        instruction currIns = id->idIns();
-                        if ((lastIns == INS_cmp) || (lastIns == INS_test) || (lastIns == INS_add) ||
-                            (lastIns == INS_sub) || (lastIns == INS_and) || (lastIns == INS_inc) ||
-                            (lastIns == INS_dec))
-                        {
-                            isJccAffectedIns |= (currIns >= INS_i_jmp && currIns < INS_align);
-                        }
-                    }
-#else
-                    bool isJccAffectedIns = false;
-#endif
-
-                    // Indicate if instruction is at or split at 32B boundary
-                    unsigned bytesCrossedBoundary = ((size_t)cp & 0x1f);
-                    if ((bytesCrossedBoundary != 0) || (isJccAffectedIns && bytesCrossedBoundary == 0))
-                    {
-                        printf("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ (%s: %d)", codeGen->genInsName(lastId->idIns()),
-                               bytesCrossedBoundary);
-                    }
-                    else
-                    {
-                        printf("...............................");
-                    }
-                    printf(" %dB boundary ...............................\n",
-                           (emitComp->opts.compJitAlignLoopBoundary));
-                }
-            }
-#endif
         }
 
 #ifdef DEBUG
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index afe465ad546522..09eda0c1f0293e 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -1741,7 +1741,8 @@ class emitter
     instrDescJmp* emitJumpLast;       // last of local jumps in method
     void          emitJumpDistBind(); // Bind all the local jumps in method
 
-    unsigned getLoopSize(insGroup* loopHeaderIg, unsigned maxLoopSize); // Get the smallest loop size
+    unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize); // Get the smallest loop size
+    void     emitSetLoopBackEdge(insGroup* dstIG);
     void     emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
 
     void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index a33a1ba6e6c2c2..cc73445fa74a08 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2651,13 +2651,12 @@ emitter::instrDesc* emitter::emitNewInstrAmdCns(emitAttr size, ssize_t dsp, int
     }
 }
 
-/*****************************************************************************
- *
- *  The next instruction will be a loop head entry point
- *  So insert a dummy instruction here to ensure that
- *  the x86 I-cache alignment rule is followed.
- */
-
+//-----------------------------------------------------------------------------
+//
+//  The next instruction will be a loop head entry point
+//  So insert a dummy instruction here to ensure that
+//  the x86 I-cache alignment rule is followed.
+//
 void emitter::emitLoopAlign()
 {
     /* Insert a pseudo-instruction to ensure that we align
@@ -2669,14 +2668,16 @@ void emitter::emitLoopAlign()
     emitCurIGsize += 15;
 }
 
-/*****************************************************************************
- *
- *  The next instruction will be a loop head entry point
- *  So insert a dummy instruction here to ensure that
- *  the x86 I-cache alignment rule is followed.
- */
-
-void emitter::emitVariableLoopAlign(unsigned short alignmentBoundary)
+//-----------------------------------------------------------------------------
+//
+//  The next instruction will be a loop head entry point
+//  So insert a dummy instruction here to ensure that
+//  the x86 I-cache alignment rule is followed.
+//
+//  This emits more than one `INS_align` instruction depending on the
+//  alignmentBoundary parameter.
+//
+void emitter::emitLongLoopAlign(unsigned short alignmentBoundary)
 {
     unsigned short nPaddingBytes    = alignmentBoundary - 1;
     unsigned short nAlignInstr      = (nPaddingBytes + (15 - 1)) / 15;
@@ -9364,6 +9365,243 @@ static BYTE* emitOutputNOP(BYTE* dst, size_t nBytes)
     return dst;
 }
 
+BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, size_t sz, BYTE* dst)
+{
+    // Candidate for loop alignment
+    assert(codeGen->ShouldAlignLoops());
+
+    unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
+    bool     skipPadding       = false;
+
+#if DEBUG
+    bool displayAlignmentDetails = (emitComp->opts.disAsm /*&& emitComp->opts.disAddr*/) || emitComp->verbose;
+#endif
+    // Check if we already detected that this IG does not need alignment
+    if ((ig->igFlags & IGF_ALIGN_LOOP) == 0)
+    {
+        skipPadding = true;
+#if DEBUG
+        if (displayAlignmentDetails)
+        {
+            printf("\t\t;; Skip alignment: 'Big loop.' in (%s)\n", emitComp->info.compFullName);
+        }
+#endif
+    }
+
+    // Check if the loop is already at alignment boundary
+    if (((size_t)dst & (alignmentBoundary - 1)) == 0)
+    {
+        skipPadding = true;
+#if DEBUG
+        if (displayAlignmentDetails)
+        {
+            printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n", alignmentBoundary,
+                    emitComp->info.compMethodName);
+        }
+#endif
+    }
+
+    unsigned paddingToAdd = 0;
+    if (!skipPadding)
+    {
+        // Adaptive padding
+        if (emitComp->opts.compJitAlignLoopAdaptive)
+        {
+            // Start to align on 32B boundary with a fallback to 16B boundary
+            int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
+            unsigned maxLoopSize             = alignmentBoundary * maxBlocksAllowedForLoop;
+            unsigned loopSize                = getLoopSize(ig->igNext, maxLoopSize);
+            unsigned minBlocksNeededForLoop  = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+            unsigned nMaxPaddingBytes        = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
+            unsigned nPaddingBytes           = (-(int)(size_t)dst) & (alignmentBoundary - 1);
+
+            // Check if the loop exceeds maxLoopSize
+            if (loopSize > maxLoopSize)
+            {
+                skipPadding = true;
+                assert(!"Should never hit maxLoopSize threshold because it should have been predicted "
+                        "earlier.");
+            }
+
+            // Check if the alignment exceeds maxPadding limit
+            else if (nPaddingBytes > nMaxPaddingBytes)
+            {
+                // Cannot align to 32B, so try to align to 16B boundary.
+                alignmentBoundary >>= 1;
+                nMaxPaddingBytes = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
+                nPaddingBytes    = (-(int)(size_t)dst) & (alignmentBoundary - 1);
+
+                // Check if the loop is already at new alignment boundary
+                if (nPaddingBytes == 0)
+                {
+                    skipPadding = true;
+#if DEBUG
+                    if (displayAlignmentDetails)
+                    {
+                        printf("\t\t;; Skip alignment: 'Loop already aligned at 16B boundary.' in (%s)\n",
+                                emitComp->info.compMethodName);
+                    }
+#endif
+                }
+                // Check if the alignment exceeds new maxPadding limit
+                else if (nPaddingBytes > nMaxPaddingBytes)
+                {
+                    skipPadding = true;
+#if DEBUG
+                    if (displayAlignmentDetails)
+                    {
+                        printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
+                                "AlignmentBoundary= %dB.' in (%s)\n",
+                                nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary,
+                                emitComp->info.compFullName);
+                    }
+#endif
+                }
+            }
+
+            if (!skipPadding)
+            {
+                // Padding is needed only if loop starts at or after the current offset.
+                // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
+                size_t extraBytesNotInLoop =
+                    (size_t)(emitComp->opts.compJitAlignLoopBoundary * minBlocksNeededForLoop) - loopSize;
+                size_t currentOffset = (size_t)dst % alignmentBoundary;
+
+                // Check if loop starts from offset such that padding can be skipped.
+                if (currentOffset <= extraBytesNotInLoop)
+                {
+                    skipPadding = true;
+#if DEBUG
+                    if (displayAlignmentDetails)
+                    {
+                        printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
+                                alignmentBoundary, emitComp->info.compMethodName);
+                    }
+#endif
+                }
+                else
+                {
+                    // Perform the padding
+                    paddingToAdd = nPaddingBytes;
+                }
+            }
+        }
+        // Non-adaptive padding
+        else
+        {
+            instrDesc* nextId = id;
+            castto(nextId, BYTE*) += sz;
+
+            // For padding > 15 bytes, check if we already performed/skipped
+            // padding during previous INS_align instruction.
+            // If yes, skip for current instruction as well as next, if that
+            // too is INS_align.
+            if ((id->idCodeSize() == 0))
+            {
+                if (nextId->idIns() == INS_align)
+                {
+                    assert(alignmentBoundary > 16);
+                    nextId->idCodeSize(0);
+                }
+                return dst;
+            }
+
+            unsigned short maxLoopSize         = emitComp->opts.compJitAlignLoopMaxCodeSize;
+            unsigned       loopSize            = getLoopSize(ig->igNext, maxLoopSize);
+            unsigned       minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+            unsigned       extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
+            unsigned       currentOffset       = (size_t)dst % alignmentBoundary;
+
+#ifdef DEBUG
+            // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
+            if (emitComp->opts.compJitAlignLoopForJcc)
+            {
+                // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
+                currentOffset++;
+            }
+#endif
+            // Check if the loop exceeds maxLoopSize
+            if (loopSize > maxLoopSize)
+            {
+                skipPadding = true;
+                assert(!"Should never hit maxLoopSize threshold because it should have been predicted "
+                        "earlier.");
+            }
+
+            // Padding is needed only if loop starts at or after the current offset.
+            // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
+            else if (currentOffset <= extraBytesNotInLoop)
+            {
+                skipPadding = true;
+#if DEBUG
+                if (displayAlignmentDetails)
+                {
+                    printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
+                            alignmentBoundary, emitComp->info.compMethodName);
+                }
+#endif
+            }
+            else
+            {
+                // Perform the padding
+                paddingToAdd = (-(int)(size_t)dst) & (alignmentBoundary - 1);
+            }
+
+            // For padding > 15 bytes, multiple INS_align(15) are emitted.
+            // If decided to skipPadding, just mark it so for future INS_align
+            // instructions as well.
+            if (!skipPadding)
+            {
+                if (nextId->idIns() == INS_align)
+                {
+                    assert(alignmentBoundary > 16);
+                    nextId->idCodeSize(0);
+                }
+            }
+        }
+    }
+
+    // Add the padding, if needed.
+    if (paddingToAdd > 0)
+    {
+        assert(!skipPadding);
+        assert(((size_t)dst & (alignmentBoundary - 1)) != 0);
+
+        unsigned padCounts   = paddingToAdd / 15;
+        unsigned lastPadding = paddingToAdd % 15;
+
+        // TODO: For padding > 15 bytes, evaluate the sequence of NOPs emitted
+        //      and see if they can be improved.
+        while (padCounts)
+        {
+            dst = emitOutputNOP(dst, 15);
+            padCounts--;
+        }
+
+        dst = emitOutputNOP(dst, lastPadding);
+
+#if DEBUG
+        if (displayAlignmentDetails)
+        {
+            printf("\t\t;; Add alignment: 'Padding= %d, AlignmentBoundary= %dB.' in (%s)\n", paddingToAdd,
+                    alignmentBoundary, emitComp->info.compFullName);
+        }
+#endif
+        // In the end dst should be at alignment boundary
+        assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
+    }
+
+    // If we didn't add as much padding as we thought, update the code size and flag.
+    if (paddingToAdd != id->idCodeSize())
+    {
+        assert(paddingToAdd != 0 || skipPadding);
+        id->idCodeSize(paddingToAdd);
+        ig->igFlags |= IGF_UPD_ISZ;
+    }
+
+    return dst;
+}
+
 /*****************************************************************************
  *
  *  Output an instruction involving an address mode.
@@ -12637,239 +12875,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             // the loop alignment pseudo instruction
             if (ins == INS_align)
             {
-                // Candidate for loop alignment
-                assert(codeGen->ShouldAlignLoops());
-
-                unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
-                bool     skipPadding       = false;
-                sz                         = SMALL_IDSC_SIZE;
-
-#if DEBUG
-                bool displayAlignmentDetails =
-                    (emitComp->opts.disAsm /*&& emitComp->opts.disAddr*/) || emitComp->verbose;
-#endif
-               // Check if we already detected that this IG do not need alignment
-               if ((ig->igFlags & IGF_ALIGN_LOOP) == 0)
-                {
-                    skipPadding = true;
-#if DEBUG
-                    if (displayAlignmentDetails)
-                    {
-                        printf("\t\t;; Skip alignment: 'Big loop.' in (%s)\n", emitComp->info.compFullName);
-                    }
-#endif
-                }
-
-                // Check if the loop is already at alignment boundary
-                if (((size_t)dst & (alignmentBoundary - 1)) == 0)
-                {
-                    skipPadding = true;
-#if DEBUG
-                    if (displayAlignmentDetails)
-                    {
-                        printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
-                               alignmentBoundary, emitComp->info.compMethodName);
-                    }
-#endif
-                }
-
-                unsigned paddingToAdd = 0;
-                if (!skipPadding)
-                {
-                    // Adaptive padding
-                    if (emitComp->opts.compJitAlignLoopAdaptive)
-                    {
-                        // Start to align on 32B boundary with a fallback to 16B boundary
-                        int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
-                        unsigned maxLoopSize             = alignmentBoundary * maxBlocksAllowedForLoop;
-                        unsigned loopSize                = getLoopSize(ig->igNext, maxLoopSize);
-                        unsigned minBlocksNeededForLoop  = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-                        unsigned nMaxPaddingBytes = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
-                        unsigned nPaddingBytes    = (-(int)(size_t)dst) & (alignmentBoundary - 1);
-
-                        // Check if the loop exceed maxSize
-                        if (loopSize > maxLoopSize)
-                        {
-                            skipPadding = true;
-                            assert(!"Should never hit maxLoopSize threshold because it should have been predicted "
-                                    "earlier.");
-                        }
-
-                        // Check if the alignment exceeds maxPadding limit
-                        else if (nPaddingBytes > nMaxPaddingBytes)
-                        {
-                            // Cannot align to 32B, so try to align to 16B boundary.
-                            alignmentBoundary >>= 1;
-                            nMaxPaddingBytes  = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
-                            nPaddingBytes     = (-(int)(size_t)dst) & (alignmentBoundary - 1);
-
-                            // Check if the loop is already at new alignment boundary
-                            if (nPaddingBytes == 0)
-                            {
-                                skipPadding = true;
-#if DEBUG
-                                if (displayAlignmentDetails)
-                                {
-                                    printf("\t\t;; Skip alignment: 'Loop already aligned at 16B boundary.' in (%s)\n",
-                                           emitComp->info.compMethodName);
-                                }
-#endif
-                            }
-                            // Check if the alignment exceeds new maxPadding limit
-                            else if (nPaddingBytes > nMaxPaddingBytes)
-                            {
-                                skipPadding = true;
-#if DEBUG
-                                if (displayAlignmentDetails)
-                                {
-                                    printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
-                                           "AlignmentBoundary= %dB.' in (%s)\n",
-                                           nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary,
-                                           emitComp->info.compFullName);
-                                }
-#endif
-                            }
-                        }
-
-                        if (!skipPadding)
-                        {
-                            // Padding is needed only if loop starts at or after the current offset.
-                            // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
-                            size_t extraBytesNotInLoop = (size_t)(32 * minBlocksNeededForLoop) - loopSize; // For calculation, use 32B chunks
-                            size_t currentOffset = (size_t)dst % alignmentBoundary;
-
-                            // Check if loop starts from offset such that padding can be skipped.
-                            if (currentOffset <= extraBytesNotInLoop)
-                            {
-                                skipPadding = true;
-#if DEBUG
-                                if (displayAlignmentDetails)
-                                {
-                                    printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
-                                          alignmentBoundary, emitComp->info.compMethodName);
-                                }
-#endif
-                            }
-                            else
-                            {
-                                // Perform the padding
-                                paddingToAdd = nPaddingBytes;
-                            }
-                        }
-                    }
-                    // Non-adaptive padding
-                    else
-                    {
-                        instrDesc* nextId = id;
-                        castto(nextId, BYTE*) += sz;
-
-                        // For padding > 15 bytes, check if we already performed/skipped
-                        // padding during previous INS_align instruction.
-                        // If yes, skip for current instruction as well as next, if that
-                        // too is INS_align.
-                        if ((id->idCodeSize() == 0))
-                        {
-                            if (nextId->idIns() == INS_align)
-                            {
-                                assert(alignmentBoundary > 16);
-                                nextId->idCodeSize(0);
-                            }
-                            break;
-                        }
-
-                        unsigned short maxLoopSize   = emitComp->opts.compJitAlignLoopMaxCodeSize;
-                        unsigned loopSize            = getLoopSize(ig->igNext, maxLoopSize);
-                        unsigned minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-                        unsigned extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
-                        unsigned currentOffset       = (size_t)dst % alignmentBoundary;
-
-#ifdef DEBUG
-                        // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
-                        if (emitComp->opts.compJitAlignLoopForJcc)
-                        {
-                            // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
-                            currentOffset++;
-                        }
-#endif
-                        // Check if the loop exceed maxSize
-                        if (loopSize > maxLoopSize)
-                        {
-                            skipPadding = true;
-                            assert(!"Should never hit maxLoopSize threshold because it should have been predicted "
-                                    "earlier.");
-                        }
-
-                        // Padding is needed only if loop starts at or after the current offset.
-                        // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
-                        else if (currentOffset <= extraBytesNotInLoop)
-                        {
-                            skipPadding = true;
-#if DEBUG
-                            if (displayAlignmentDetails)
-                            {
-                                printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
-                                       alignmentBoundary, emitComp->info.compMethodName);
-                            }
-#endif
-                        }
-                        else
-                        {
-                            // Perform the padding
-                            paddingToAdd = (-(int)(size_t)dst) & (alignmentBoundary - 1);
-                        }
-
-                        // For padding > 15 bytes, multiple INS_align(15) are emitted.
-                        // If decided to skipPadding, just mark it so for future INS_align
-                        // instructions as well.
-                        if (!skipPadding)
-                        {
-                            if (nextId->idIns() == INS_align)
-                            {
-                                assert(alignmentBoundary > 16);
-                                nextId->idCodeSize(0);
-                            }
-                        }
-                    }
-                }
-
-                // Add the padding, if needed.
-                if (paddingToAdd > 0)
-                {
-                    assert(!skipPadding);
-                    assert(((size_t)dst & (alignmentBoundary - 1)) != 0);
-
-                    unsigned padCounts   = paddingToAdd / 15;
-                    unsigned lastPadding = paddingToAdd % 15;
-
-                    //TODO: For padding > 15 bytes, evaluate the sequence of NOPs emitted
-                    //      and see if they can be improved.
-                    while (padCounts)
-                    {
-                        dst = emitOutputNOP(dst, 15);
-                        padCounts--;
-                    }
-
-                    dst = emitOutputNOP(dst, lastPadding);
-
-#if DEBUG
-                    if (displayAlignmentDetails)
-                    {
-                        printf("\t\t;; Add alignment: 'Padding= %d, AlignmentBoundary= %dB.' in (%s)\n",
-                               paddingToAdd, alignmentBoundary, emitComp->info.compFullName);
-                    }
-#endif
-                    // In the end dst should be at alignment boundary
-                    assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
-                }
-
-                // If we didn't add as much padding as we thought, update the code size and flag.
-                if (paddingToAdd != id->idCodeSize())
-                {
-                    assert(paddingToAdd != 0 || skipPadding);
-                    id->idCodeSize(paddingToAdd);
-                    ig->igFlags |= IGF_UPD_ISZ;
-                }
-
+                sz  = SMALL_IDSC_SIZE;
+                dst = emitOutputAlign(ig, id, sz, dst);
                 break;
             }
 
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index 1eafe7d90a4c60..c27e5c851e2b76 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -50,6 +50,7 @@ UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code, int val);
 UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code);
 UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code, int val);
 
+BYTE* emitOutputAlign(insGroup* ig, instrDesc* id, size_t sz, BYTE* dst);
 BYTE* emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr);
 BYTE* emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr);
 BYTE* emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr);
@@ -289,7 +290,7 @@ inline emitAttr emitDecodeScale(unsigned ensz)
 public:
 void emitLoopAlign();
 
-void emitVariableLoopAlign(unsigned short alignmentBoundary);
+void emitLongLoopAlign(unsigned short alignmentBoundary);
 
 void emitIns(instruction ins);
 
diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index 202438f75d80e7..0d7a996f84a422 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -10946,6 +10946,12 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext)
             break;
     }
 
+    // Add the LOOP_ALIGN flag, if applicable
+    if (bNext->bbFlags & BBF_LOOP_ALIGN)
+    {
+        block->bbFlags |= BBF_LOOP_ALIGN;
+    }
+
     // If we're collapsing a block created after the dominators are
     // computed, copy block number the block and reuse dominator
     // information from bNext to block.
@@ -11048,11 +11054,6 @@ void Compiler::fgUpdateLoopsAfterCompacting(BasicBlock* block, BasicBlock* bNext
             optLoopTable[loopNum].lpEntry = block;
         }
     }
-
-    if (bNext->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
-    {
-        block->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
-    }
 }
 
 /*****************************************************************************************************
@@ -11552,9 +11553,9 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable)
             skipUnmarkLoop = true;
         }
 
-        if (block->bbFlags & BBF_FIRST_BLOCK_IN_INNERLOOP)
+        if (block->bbFlags & BBF_LOOP_ALIGN)
         {
-            succBlock->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
+            succBlock->bbFlags |= BBF_LOOP_ALIGN;
         }
 
         noway_assert(succBlock);
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 87fb5982827c56..237db340b9896c 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -1653,7 +1653,7 @@ class LoopSearch
 
         if (top->bbNum > bottom->bbNum) // is this a backward edge? (from BOTTOM to TOP)
         {
-            // Edge from TOP to BOTTOM is not a backward edge
+            // Edge from BOTTOM to TOP is not a backward edge
             return false;
         }
 
@@ -2542,18 +2542,6 @@ void Compiler::optFindNaturalLoops()
             }
             assert(blk->bbNext != nullptr); // We should never reach nullptr.
         }
-
-#if defined(TARGET_XARCH)
-        if (codeGen->ShouldAlignLoops())
-        {
-            // An innerloop candidate that might need alignment
-            if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) &&
-                opts.compJitAlignLoopMinBlockWeight <= first->getBBWeight(this))
-            {
-                first->bbFlags |= BBF_FIRST_BLOCK_IN_INNERLOOP;
-            }
-        }
-#endif
     }
 
     // Make sure that loops are canonical: that every loop has a unique "top", by creating an empty "nop"
@@ -2590,6 +2578,33 @@ void Compiler::optFindNaturalLoops()
 #endif // DEBUG
 }
 
+//-----------------------------------------------------------------------------
+// optIdentifyLoopsForAlignment: When codeGen->ShouldAlignLoops() is true, set
+// BBF_LOOP_ALIGN on the first block (lpFirst) of every loop in optLoopTable
+// that has no child loop and whose first-block weight is at least
+// opts.compJitAlignLoopMinBlockWeight.
+
+void Compiler::optIdentifyLoopsForAlignment()
+{
+#if defined(TARGET_XARCH)
+    if (codeGen->ShouldAlignLoops())
+    {
+        for (unsigned char loopInd = 0; loopInd < optLoopCount; loopInd++)
+        {
+            BasicBlock* first = optLoopTable[loopInd].lpFirst;
+
+            // A loop with no child loop is an inner loop: a candidate that might need alignment
+            if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) &&
+                opts.compJitAlignLoopMinBlockWeight <= first->getBBWeight(this))
+            {
+                first->bbFlags |= BBF_LOOP_ALIGN;
+                JITDUMP("L%02u that starts at " FMT_BB " needs alignment.\n", loopInd, first->bbNum);
+            }
+        }
+    }
+#endif
+}
+
 void Compiler::optRedirectBlock(BasicBlock* blk, BlockToBlockMap* redirectMap)
 {
     BasicBlock* newJumpDest = nullptr;
@@ -4440,6 +4455,10 @@ void Compiler::optOptimizeLoops()
 
         optFindNaturalLoops();
 
+        // Check if any of the loops need alignment
+
+        optIdentifyLoopsForAlignment();
+
         unsigned loopNum = 0;
 
         /* Iterate over the flow graph, marking all loops */
@@ -7987,12 +8006,23 @@ bool Compiler::optComputeLoopSideEffectsOfBlock(BasicBlock* blk)
 // Marks the containsCall information to "lnum" and any parent loops.
 void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
 {
+    unsigned nestedLoopNum = lnum; // remember the starting loop; lnum is overwritten by the parent walk below
     assert(0 <= lnum && lnum < optLoopCount);
     while (lnum != BasicBlock::NOT_IN_LOOP)
     {
         optLoopTable[lnum].lpContainsCall = true;
         lnum                              = optLoopTable[lnum].lpParent;
     }
+
+    // If the starting loop is an innermost loop (it has no child loop),
+    // clear BBF_LOOP_ALIGN on its first block: a loop that contains a
+    // call is unlikely to benefit from alignment.
+    if (optLoopTable[nestedLoopNum].lpChild == BasicBlock::NOT_IN_LOOP)
+    {
+        BasicBlock* first = optLoopTable[nestedLoopNum].lpFirst;
+        first->bbFlags &= ~BBF_LOOP_ALIGN;
+        JITDUMP("Skip alignment for L%02u that starts at " FMT_BB " because loop has a call.\n", nestedLoopNum, first->bbNum);
+    }
 }
 
 // Adds the variable liveness information for 'blk' to 'this' LoopDsc

From ef0b149f18240faa63d695a81cf1cf1510f927f7 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 4 Dec 2020 09:09:35 -0800
Subject: [PATCH 34/59] jit format

---
 src/coreclr/jit/compiler.cpp      | 12 ++++++------
 src/coreclr/jit/compiler.h        | 18 +++++++++---------
 src/coreclr/jit/emit.cpp          |  9 ++++-----
 src/coreclr/jit/emit.h            |  4 ++--
 src/coreclr/jit/emitxarch.cpp     | 16 ++++++++--------
 src/coreclr/jit/jitconfigvalues.h |  4 ++--
 src/coreclr/jit/optimizer.cpp     |  3 ++-
 7 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 585fd17a222a1e..18664ef3fa5aa7 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2308,7 +2308,7 @@ void Compiler::compSetProcessor()
     opts.compUseCMOV = jitFlags.IsSet(JitFlags::JIT_FLAG_USE_CMOV);
 #ifdef DEBUG
     if (opts.compUseCMOV)
-        opts.compUseCMOV = !compStressCompile(STRESS_USE_CMOV, 50);
+        opts.compUseCMOV                = !compStressCompile(STRESS_USE_CMOV, 50);
 #endif // DEBUG
 
 #endif // TARGET_X86
@@ -2617,13 +2617,13 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
 
 #ifdef DEBUG
     opts.compJitAlignLoopAdaptive       = JitConfig.JitAlignLoopAdaptive() == 1;
-    opts.compJitAlignLoopBoundary       = (unsigned short) JitConfig.JitAlignLoopBoundary();
-    opts.compJitAlignLoopMinBlockWeight = (unsigned short) JitConfig.JitAlignLoopMinBlockWeight();
+    opts.compJitAlignLoopBoundary       = (unsigned short)JitConfig.JitAlignLoopBoundary();
+    opts.compJitAlignLoopMinBlockWeight = (unsigned short)JitConfig.JitAlignLoopMinBlockWeight();
 
-    opts.compJitAlignLoopForJcc         = JitConfig.JitAlignLoopForJcc() == 1;
-    opts.compJitAlignLoopMaxCodeSize    = (unsigned short) JitConfig.JitAlignLoopMaxCodeSize();
+    opts.compJitAlignLoopForJcc      = JitConfig.JitAlignLoopForJcc() == 1;
+    opts.compJitAlignLoopMaxCodeSize = (unsigned short)JitConfig.JitAlignLoopMaxCodeSize();
 #else
-    opts.compJitAlignLoopAdaptive = true;
+    opts.compJitAlignLoopAdaptive       = true;
     opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
 #endif
 
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 5c05d0f12aa33e..5329934855a9bc 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9038,20 +9038,20 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         bool dspGCtbls;       // Display the GC tables
 #endif
 
-        // Default numbers used to perform loop alignment. All the numbers are choosen
-        // based on experimenting with various benchmarks.
+// Default numbers used to perform loop alignment. All the numbers are choosen
+// based on experimenting with various benchmarks.
 
-        // Default minimum loop block weight required to enable loop alignment.
+// Default minimum loop block weight required to enable loop alignment.
 #define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 10
 
-        // By default a loop will be aligned at 32B address boundary to get better
-        // performance as per architecture manuals.
+// By default a loop will be aligned at 32B address boundary to get better
+// performance as per architecture manuals.
 #define DEFAULT_ALIGN_LOOP_BOUNDARY 0x20
 
-        // For non-adaptive loop alignment, by default, only align a loop whose size is
-        // atmost 3 times of 32B chunk. If the loop is bigger than that, it is most
-        // likely the loop code is complicated enough and aligning such loop will not help
-        // much.
+// For non-adaptive loop alignment, by default, only align a loop whose size is
+// atmost 3 times of 32B chunk. If the loop is bigger than that, it is most
+// likely the loop code is complicated enough and aligning such loop will not help
+// much.
 #define DEFAULT_MAX_LOOPSIZE_FOR_ALIGN DEFAULT_ALIGN_LOOP_BOUNDARY * 3
 
 #ifdef DEBUG
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index bb908c5d1de795..26b7b067d5edb8 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3614,7 +3614,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
     size_t is;
 
 #ifdef DEBUG
-    size_t     beforeAddr = (size_t)*dp;
+    size_t beforeAddr = (size_t)*dp;
 #endif
 
     /* Record the beginning offset of the instruction */
@@ -4534,13 +4534,12 @@ void emitter::emitJumpDistBind()
 #endif // DEBUG
 }
 
-
 //-----------------------------------------------------------------------------
 //  For loopHeaderIg, find the size of the smallest possible loop that doesn't exceed maxLoopSize.
 //
 unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize)
 {
-    unsigned  loopSize     = 0;
+    unsigned loopSize = 0;
 
     for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext)
     {
@@ -4594,12 +4593,12 @@ void emitter::emitLoopAlignAdjustments()
     else
     {
         // For non-adaptive, just take whatever is supplied using COMPlus_ variables
-        maxLoopSize       = emitComp->opts.compJitAlignLoopMaxCodeSize;
+        maxLoopSize     = emitComp->opts.compJitAlignLoopMaxCodeSize;
         maxPaddingAdded = alignmentBoundary - 1;
     }
 
     unsigned alignBytesRemoved = 0;
-    unsigned loopSize = 0;
+    unsigned loopSize          = 0;
     for (insGroup* ig = emitIGlist; ig != nullptr; ig = ig->igNext)
     {
         ig->igOffs -= alignBytesRemoved;
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 09eda0c1f0293e..70c1b7855fd65c 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -1742,8 +1742,8 @@ class emitter
     void          emitJumpDistBind(); // Bind all the local jumps in method
 
     unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize); // Get the smallest loop size
-    void     emitSetLoopBackEdge(insGroup* dstIG);
-    void     emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
+    void emitSetLoopBackEdge(insGroup* dstIG);
+    void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
 
     void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets
 
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index cc73445fa74a08..83357372c78c9c 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -9396,7 +9396,7 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, size_t sz, BYTE* dst
         if (displayAlignmentDetails)
         {
             printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n", alignmentBoundary,
-                    emitComp->info.compMethodName);
+                   emitComp->info.compMethodName);
         }
 #endif
     }
@@ -9439,7 +9439,7 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, size_t sz, BYTE* dst
                     if (displayAlignmentDetails)
                     {
                         printf("\t\t;; Skip alignment: 'Loop already aligned at 16B boundary.' in (%s)\n",
-                                emitComp->info.compMethodName);
+                               emitComp->info.compMethodName);
                     }
 #endif
                 }
@@ -9451,9 +9451,9 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, size_t sz, BYTE* dst
                     if (displayAlignmentDetails)
                     {
                         printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
-                                "AlignmentBoundary= %dB.' in (%s)\n",
-                                nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary,
-                                emitComp->info.compFullName);
+                               "AlignmentBoundary= %dB.' in (%s)\n",
+                               nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary,
+                               emitComp->info.compFullName);
                     }
 #endif
                 }
@@ -9475,7 +9475,7 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, size_t sz, BYTE* dst
                     if (displayAlignmentDetails)
                     {
                         printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
-                                alignmentBoundary, emitComp->info.compMethodName);
+                               alignmentBoundary, emitComp->info.compMethodName);
                     }
 #endif
                 }
@@ -9537,7 +9537,7 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, size_t sz, BYTE* dst
                 if (displayAlignmentDetails)
                 {
                     printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
-                            alignmentBoundary, emitComp->info.compMethodName);
+                           alignmentBoundary, emitComp->info.compMethodName);
                 }
 #endif
             }
@@ -9584,7 +9584,7 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, size_t sz, BYTE* dst
         if (displayAlignmentDetails)
         {
             printf("\t\t;; Add alignment: 'Padding= %d, AlignmentBoundary= %dB.' in (%s)\n", paddingToAdd,
-                    alignmentBoundary, emitComp->info.compFullName);
+                   alignmentBoundary, emitComp->info.compFullName);
         }
 #endif
         // In the end dst should be at alignment boundary
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 997dc0e2fcfff2..bddd4201055d5f 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -48,8 +48,8 @@ CONFIG_INTEGER(JitAlignLoopMinBlockWeight,
 CONFIG_INTEGER(JitAlignLoopMaxCodeSize,
                W("JitAlignLoopMaxCodeSize"),
                DEFAULT_MAX_LOOPSIZE_FOR_ALIGN) // For non-adaptive alignment, minimum loop size (in bytes) for which
-                                                // alignment will be done.
-                     // Defaults to 3 blocks of 32 bytes chunks = 96 bytes.
+                                               // alignment will be done.
+                                               // Defaults to 3 blocks of 32 bytes chunks = 96 bytes.
 CONFIG_INTEGER(JitAlignLoopBoundary,
                W("JitAlignLoopBoundary"),
                DEFAULT_ALIGN_LOOP_BOUNDARY) // For non-adaptive alignment, address boundary (power of 2) at which loop
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 237db340b9896c..dda1da66bf40f2 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -8021,7 +8021,8 @@ void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
     {
         BasicBlock* first = optLoopTable[nestedLoopNum].lpFirst;
         first->bbFlags &= ~BBF_LOOP_ALIGN;
-        JITDUMP("Skip alignment for L%02u that starts at " FMT_BB " because loop has a call.\n", nestedLoopNum, first->bbNum);
+        JITDUMP("Skip alignment for L%02u that starts at " FMT_BB " because loop has a call.\n", nestedLoopNum,
+                first->bbNum);
     }
 }
 

From 599ad439e9c2d702341abc1113fb5afb5b767289 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 4 Dec 2020 18:28:05 -0800
Subject: [PATCH 35/59] Add FEATURE_LOOP_ALIGN

---
 src/coreclr/jit/codegenlinear.cpp | 4 +++-
 src/coreclr/jit/emit.cpp          | 2 +-
 src/coreclr/jit/jit.h             | 4 ++++
 src/coreclr/jit/jitconfigvalues.h | 4 ++++
 src/coreclr/jit/optimizer.cpp     | 2 +-
 5 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 5af9c5b68f816d..882838a14c2c5d 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -739,6 +739,7 @@ void CodeGen::genCodeForBBlist()
 
             case BBJ_COND:
 
+#ifdef FEATURE_LOOP_ALIGN
                 // This is the last place where we operate on blocks and after this, we operate
                 // on IG. Hence, if we know that the destination of "block" is the first block
                 // of a loop and needs alignment (it has BBF_LOOP_ALIGN), then "block" represents
@@ -751,6 +752,7 @@ void CodeGen::genCodeForBBlist()
                 {
                     GetEmitter()->emitSetLoopBackEdge((insGroup*)block->bbJumpDest->bbEmitCookie);
                 }
+#endif
                 break;
 
             default:
@@ -758,7 +760,7 @@ void CodeGen::genCodeForBBlist()
                 break;
         }
 
-#if defined(TARGET_XARCH)
+#ifdef FEATURE_LOOP_ALIGN
 
         // If next block is the first block of a loop (identified by BBF_LOOP_ALIGN),
         // then need to add align instruction in current "block". Also mark the
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 26b7b067d5edb8..7bd672dcfe16b1 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3658,7 +3658,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
         // It is fatal to under-estimate the instruction size, except for alignment instructions
         bool validCodeSize = id->idCodeSize() >= csz;
 
-#if defined(TARGET_XARCH)
+#ifdef FEATURE_LOOP_ALIGN
         validCodeSize |= (!emitComp->opts.compJitAlignLoopAdaptive && (id->idIns() == INS_align) &&
                           (emitComp->opts.compJitAlignLoopBoundary > 16));
 #endif
diff --git a/src/coreclr/jit/jit.h b/src/coreclr/jit/jit.h
index 9fb780dbd40c66..62e7ac8059b16d 100644
--- a/src/coreclr/jit/jit.h
+++ b/src/coreclr/jit/jit.h
@@ -747,6 +747,10 @@ class Histogram
 #define CLFLG_STRUCTPROMOTE 0x00000
 #endif
 
+#ifdef TARGET_XARCH
+#define FEATURE_LOOP_ALIGN 1
+#endif
+
 #define CLFLG_MAXOPT                                                                                                   \
     (CLFLG_CSE | CLFLG_REGVAR | CLFLG_RNGCHKOPT | CLFLG_DEADASGN | CLFLG_CODEMOTION | CLFLG_QMARK | CLFLG_TREETRANS |  \
      CLFLG_INLINING | CLFLG_STRUCTPROMOTE | CLFLG_CONSTANTFOLD)
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index bddd4201055d5f..81ecb0e8c52c69 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -223,7 +223,11 @@ CONFIG_INTEGER(EnableIncompleteISAClass, W("EnableIncompleteISAClass"), 0) // En
                                                                            // intrinsic classes
 #endif                                                                     // defined(DEBUG)
 
+#ifdef FEATURE_LOOP_ALIGN
 CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 1) // If set, align inner loops
+#else
+CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 0)
+#endif
 
 ///
 /// JIT
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index dda1da66bf40f2..5a0c0cef7683ef 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2586,7 +2586,7 @@ void Compiler::optFindNaturalLoops()
 
 void Compiler::optIdentifyLoopsForAlignment()
 {
-#if defined(TARGET_XARCH)
+#ifdef FEATURE_LOOP_ALIGN
     if (codeGen->ShouldAlignLoops())
     {
         for (unsigned char loopInd = 0; loopInd < optLoopCount; loopInd++)

From 97fd373ebd4af26a6b3bf526aec683b1e2e7591c Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 9 Dec 2020 10:01:15 -0800
Subject: [PATCH 36/59] remove special case for align

---
 src/coreclr/jit/emit.cpp | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 7bd672dcfe16b1..259f64a6ee23a9 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -3657,11 +3657,6 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
     {
         // It is fatal to under-estimate the instruction size, except for alignment instructions
         bool validCodeSize = id->idCodeSize() >= csz;
-
-#ifdef FEATURE_LOOP_ALIGN
-        validCodeSize |= (!emitComp->opts.compJitAlignLoopAdaptive && (id->idIns() == INS_align) &&
-                          (emitComp->opts.compJitAlignLoopBoundary > 16));
-#endif
         noway_assert(validCodeSize);
 
 #if DEBUG_EMIT

From 37b0cdbeb31cccd09c8f3755651327dd71ed2639 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Mon, 14 Dec 2020 11:23:55 -0800
Subject: [PATCH 37/59] Do not propagate BBF_LOOP_ALIGN in certain cases

---
 src/coreclr/jit/block.h           |  4 ++++
 src/coreclr/jit/codegenlinear.cpp | 11 +++++++----
 src/coreclr/jit/emit.h            |  7 ++++++-
 src/coreclr/jit/flowgraph.cpp     | 10 ++--------
 4 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h
index 9f16020d93c3a7..d7f49f067fe191 100644
--- a/src/coreclr/jit/block.h
+++ b/src/coreclr/jit/block.h
@@ -464,6 +464,10 @@ struct BasicBlock : private LIR::Range
     {
         return ((bbFlags & BBF_LOOP_HEAD) != 0);
     }
+    bool isLoopAlign() const // true when BBF_LOOP_ALIGN is set in bbFlags
+    {
+        return ((bbFlags & BBF_LOOP_ALIGN) != 0);
+    }
 
 // Flags to update when two blocks are compacted
 
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 882838a14c2c5d..751e327f2a471f 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -349,6 +349,9 @@ void CodeGen::genCodeForBBlist()
             needLabel = true;
         }
 
+        // Make sure we did not add align instruction in the middle of IG.
+        assert(needLabel || !GetEmitter()->emitCurIG->isLoopAlign());
+
         if (needLabel)
         {
             // Mark a label and update the current set of live GC refs
@@ -748,7 +751,7 @@ void CodeGen::genCodeForBBlist()
                 // During emitter, this information will be used to calculate the loop size.
                 // Depending on the loop size, decision of whether to align a loop or not will be taken.
 
-                if (block->bbJumpDest->bbFlags & BBF_LOOP_ALIGN)
+                if (block->bbJumpDest->isLoopAlign())
                 {
                     GetEmitter()->emitSetLoopBackEdge((insGroup*)block->bbJumpDest->bbEmitCookie);
                 }
@@ -764,14 +767,14 @@ void CodeGen::genCodeForBBlist()
 
         // If next block is the first block of a loop (identified by BBF_LOOP_ALIGN),
         // then need to add align instruction in current "block". Also mark the
-        // corresponding IG with IGF_ALIGN_LOOP to know that there will be align
+        // corresponding IG with IGF_LOOP_ALIGN to know that there will be align
         // instructions at the end of that IG.
         //
         // For non-adaptive alignment, add alignment instruction of size depending on the
         // compJitAlignLoopBoundary.
         // For adaptive alignment, alignment instruction will always be of 15 bytes.
 
-        if ((block->bbNext != nullptr) && (block->bbNext->bbFlags & BBF_LOOP_ALIGN))
+        if ((block->bbNext != nullptr) && (block->bbNext->isLoopAlign()))
         {
             assert(ShouldAlignLoops());
 
@@ -786,7 +789,7 @@ void CodeGen::genCodeForBBlist()
 
             // Mark this IG as need alignment so during emitter we can check the instruction count heuristics of
             // all IGs that follows this IG and participate in a loop.
-            GetEmitter()->emitCurIG->igFlags |= IGF_ALIGN_LOOP;
+            GetEmitter()->emitCurIG->igFlags |= IGF_LOOP_ALIGN;
 
             JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u to align loop header block.\n" FMT_BB,
                     compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum);
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 70c1b7855fd65c..5057554aad1899 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -265,7 +265,7 @@ struct insGroup
 #define IGF_PLACEHOLDER 0x0100    // this is a placeholder group, to be filled in later
 #define IGF_EXTEND 0x0200         // this block is conceptually an extension of the previous block
                                   // and the emitter should continue to track GC info as if there was no new block.
-#define IGF_ALIGN_LOOP 0x0400     // this group contains alignment instruction at the end because the next IG points
+#define IGF_LOOP_ALIGN 0x0400     // this group contains alignment instruction at the end because the next IG points
                                   // to inner loop that needs alignment.
 
 // Mask of IGF_* flags that should be propagated to new blocks when they are created.
@@ -339,6 +339,11 @@ struct insGroup
         return *(unsigned*)ptr;
     }
 
+    bool isLoopAlign() const // true when IGF_LOOP_ALIGN is set in igFlags
+    {
+        return (igFlags & IGF_LOOP_ALIGN) != 0;
+    }
+
 }; // end of struct insGroup
 
 //  For AMD64 the maximum prolog/epilog size supported on the OS is 256 bytes
diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index 0d7a996f84a422..bcceb7d2763a6c 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -9644,7 +9644,7 @@ BasicBlock* Compiler::fgSplitBlockAtEnd(BasicBlock* curr)
     // Remove flags that the new block can't have.
     newBlock->bbFlags &=
         ~(BBF_TRY_BEG | BBF_LOOP_HEAD | BBF_LOOP_CALL0 | BBF_LOOP_CALL1 | BBF_HAS_LABEL | BBF_JMP_TARGET |
-          BBF_FUNCLET_BEG | BBF_LOOP_PREHEADER | BBF_KEEP_BBJ_ALWAYS | BBF_PATCHPOINT | BBF_BACKWARD_JUMP_TARGET);
+          BBF_FUNCLET_BEG | BBF_LOOP_PREHEADER | BBF_KEEP_BBJ_ALWAYS | BBF_PATCHPOINT | BBF_BACKWARD_JUMP_TARGET | BBF_LOOP_ALIGN);
 
     // Remove the GC safe bit on the new block. It seems clear that if we split 'curr' at the end,
     // such that all the code is left in 'curr', and 'newBlock' just gets the control flow, then
@@ -10946,12 +10946,6 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext)
             break;
     }
 
-    // Add the LOOP_ALIGN flag, if applicable
-    if (bNext->bbFlags & BBF_LOOP_ALIGN)
-    {
-        block->bbFlags |= BBF_LOOP_ALIGN;
-    }
-
     // If we're collapsing a block created after the dominators are
     // computed, copy block number the block and reuse dominator
     // information from bNext to block.
@@ -11553,7 +11547,7 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable)
             skipUnmarkLoop = true;
         }
 
-        if (block->bbFlags & BBF_LOOP_ALIGN)
+        if (block->isLoopAlign())
         {
             succBlock->bbFlags |= BBF_LOOP_ALIGN;
         }

From c0cc8af0188ded7d2a634bf88a9c6f7b1c1ee0a8 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Mon, 14 Dec 2020 20:17:48 -0800
Subject: [PATCH 38/59] Introduce instrDescAlign and emitLastAlignedIgNum

---
 src/coreclr/jit/emit.cpp      |  6 +++++
 src/coreclr/jit/emit.h        | 23 ++++++++++++++++++
 src/coreclr/jit/emitxarch.cpp | 44 +++++++++++++++++++++--------------
 3 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 259f64a6ee23a9..adbe7db12bd1f3 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -160,6 +160,8 @@ unsigned emitter::emitSmallCnsCnt;
 unsigned emitter::emitLargeCnsCnt;
 unsigned emitter::emitSmallCns[SMALL_CNS_TSZ];
 
+unsigned emitter::emitTotalDescAlignCnt;
+
 void emitterStaticStats(FILE* fout)
 {
     // insGroup members
@@ -387,6 +389,9 @@ void emitterStats(FILE* fout)
         fprintf(fout, "Total instrDescReloc:  %8u (%5.2f%%)\n", emitter::emitTotalIDescRelocCnt,
                 100.0 * emitter::emitTotalIDescRelocCnt / emitter::emitTotalInsCnt);
 #endif // TARGET_ARM
+        fprintf(fout, "Total emitTotalDescAlignCnt:  %8u (%5.2f%%)\n", emitter::emitTotalDescAlignCnt,
+                100.0 * emitter::emitTotalDescAlignCnt / emitter::emitTotalInsCnt);
+
         fprintf(fout, "\n");
     }
 
@@ -932,6 +937,7 @@ void emitter::emitBegFN(bool hasFramePtr
 
     emitCurIGfreeBase = nullptr;
     emitIGbuffSize    = 0;
+    emitLastAlignedIgNum = 0;
 
     /* Record stack frame info (the temp size is just an estimate) */
 
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 5057554aad1899..5c33739d28bbc5 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -1369,6 +1369,12 @@ class emitter
                                   // hot to cold and cold to hot jumps)
     };
 
+    struct instrDescAlign : instrDesc // descriptor used for INS_align instructions
+    {
+        instrDescAlign* idaNext; // next align instruction in the group/method list
+        insGroup*     idaIG;   // instruction group that contains this align instruction
+    };
+
 #if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT.
     struct instrDescLbl : instrDescJmp
     {
@@ -1990,6 +1996,14 @@ class emitter
         return (instrDescCGCA*)emitAllocAnyInstr(sizeof(instrDescCGCA), attr);
     }
 
+    instrDescAlign* emitAllocInstrAlign() // allocate a descriptor of sizeof(instrDescAlign)
+    {
+#if EMITTER_STATS
+        emitTotalDescAlignCnt++; // statistics: number of align descriptors allocated
+#endif // EMITTER_STATS
+        return (instrDescAlign*)emitAllocAnyInstr(sizeof(instrDescAlign), EA_1BYTE);
+    }
+
     instrDesc* emitNewInstrSmall(emitAttr attr);
     instrDesc* emitNewInstr(emitAttr attr = EA_4BYTE);
     instrDesc* emitNewInstrSC(emitAttr attr, cnsval_ssize_t cns);
@@ -2005,6 +2019,7 @@ class emitter
     instrDescLbl* emitNewInstrLbl();
 #endif // !TARGET_ARM64
 
+    instrDescAlign*   emitNewInstrAlign();
     static const BYTE emitFmtToOps[];
 
 #ifdef DEBUG
@@ -2311,6 +2326,7 @@ class emitter
 #define SMALL_CNS_TSZ 256
     static unsigned emitSmallCns[SMALL_CNS_TSZ];
     static unsigned emitLargeCnsCnt;
+    static unsigned emitTotalDescAlignCnt;
 
     static unsigned emitIFcounts[IF_COUNT];
 
@@ -2513,6 +2529,13 @@ inline emitter::instrDescJmp* emitter::emitNewInstrJmp()
     return emitAllocInstrJmp();
 }
 
+inline emitter::instrDescAlign* emitter::emitNewInstrAlign()
+{
+    instrDescAlign* newInstr = emitAllocInstrAlign();
+    newInstr->idIns(INS_align);
+    return newInstr;
+}
+
 #if !defined(TARGET_ARM64)
 inline emitter::instrDescLbl* emitter::emitNewInstrLbl()
 {
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 83357372c78c9c..96ff299624546d 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2657,15 +2657,24 @@ emitter::instrDesc* emitter::emitNewInstrAmdCns(emitAttr size, ssize_t dsp, int
 //  So insert a dummy instruction here to ensure that
 //  the x86 I-cache alignment rule is followed.
 //
-void emitter::emitLoopAlign()
+void emitter::emitLoopAlign(unsigned short paddingBytes)
 {
     /* Insert a pseudo-instruction to ensure that we align
        the next instruction properly */
 
-    instrDesc* id = emitNewInstrSmall(EA_1BYTE);
-    id->idIns(INS_align);
-    id->idCodeSize(15); // We may need to skip up to 15 bytes of code
-    emitCurIGsize += 15;
+    paddingBytes = min(paddingBytes, 15);  // We may need to skip up to 15 bytes of code
+    instrDescAlign* id = emitNewInstrAlign();
+    id->idCodeSize(paddingBytes);
+    emitCurIGsize += paddingBytes;
+
+    id->idaIG = emitCurIG;
+
+    /* Append this instruction to this IG's jump list */
+    id->idaNext = emitCurIGAlignList;
+    emitCurIGAlignList = id;
+
+    /* Record the last IG that has align instruction */
+    emitLastAlignedIgNum = emitCurIG->igNum;
 }
 
 //-----------------------------------------------------------------------------
@@ -2681,7 +2690,7 @@ void emitter::emitLongLoopAlign(unsigned short alignmentBoundary)
 {
     unsigned short nPaddingBytes    = alignmentBoundary - 1;
     unsigned short nAlignInstr      = (nPaddingBytes + (15 - 1)) / 15;
-    unsigned short instrDescSize    = nAlignInstr * SMALL_IDSC_SIZE;
+    unsigned short instrDescSize    = nAlignInstr * sizeof(instrDescAlign);
     unsigned short insAlignCount    = nPaddingBytes / 15;
     unsigned short lastInsAlignSize = nPaddingBytes % 15;
 
@@ -2691,22 +2700,15 @@ void emitter::emitLongLoopAlign(unsigned short alignmentBoundary)
         emitForceNewIG = true;
     }
 
+    /* Insert a pseudo-instruction to ensure that we align
+    the next instruction properly */
+
     while (insAlignCount)
     {
         emitLoopAlign();
         insAlignCount--;
     }
-
-    /* Insert a pseudo-instruction to ensure that we align
-       the next instruction properly */
-
-    if (lastInsAlignSize > 0)
-    {
-        instrDesc* id = emitNewInstrSmall(EA_1BYTE);
-        id->idIns(INS_align);
-        id->idCodeSize(lastInsAlignSize);
-        emitCurIGsize += lastInsAlignSize;
-    }
+    emitLoopAlign(lastInsAlignSize);
 }
 
 /*****************************************************************************
@@ -7381,6 +7383,12 @@ size_t emitter::emitSizeOfInsDsc(instrDesc* id)
     switch (idOp)
     {
         case ID_OP_NONE:
+#ifdef FEATURE_LOOP_ALIGN
+            if (id->idIns() == INS_align)
+            {
+                return sizeof(instrDescAlign);
+            }
+#endif
             break;
 
         case ID_OP_LBL:
@@ -12875,8 +12883,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             // the loop alignment pseudo instruction
             if (ins == INS_align)
             {
-                sz  = SMALL_IDSC_SIZE;
                 dst = emitOutputAlign(ig, id, sz, dst);
+                sz  = sizeof(instrDescAlign);
                 break;
             }
 

From 26f7e6176c205f31412d362b745aece86baad534 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Tue, 15 Dec 2020 15:57:04 -0800
Subject: [PATCH 39/59] Several changes:

- Calculate the accurate padding size before outputting the align instruction.
- While outputting, just double-check that the padding needed matches what was calculated.
- If instruction sizes are over-estimated anywhere before the last align instruction,
  compensate for the difference by adding NOPs.
- As part of the above step, do not perform the "VEX prefix shortening" encoding optimization if
  there is an align instruction later in the method.
- Fix edge cases where, because of loop cloning or the resolution phase of the register allocator,
  loops are marked such that they cover loops that are already marked for alignment. Fix by
  resetting the IGF_LOOP_ALIGN flag of the covering loop.
- During loop size calculation, if the last IG of the loop also has the `align` flag, do not count
  the align instruction's size, because that padding is reserved for the next loop.
---
 src/coreclr/jit/codegencommon.cpp |   2 +
 src/coreclr/jit/codegenlinear.cpp |   4 +-
 src/coreclr/jit/emit.cpp          | 481 +++++++++++++++++++++++++++---
 src/coreclr/jit/emit.h            |  17 +-
 src/coreclr/jit/emitxarch.cpp     | 290 ++++--------------
 src/coreclr/jit/emitxarch.h       |   4 +-
 src/coreclr/jit/optimizer.cpp     |   9 +-
 7 files changed, 534 insertions(+), 273 deletions(-)

diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index c7ecda8fb17721..a2c9f4a4b08c45 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -2258,9 +2258,11 @@ void CodeGen::genGenerateMachineCode()
 
     GetEmitter()->emitJumpDistBind();
 
+#ifdef FEATURE_LOOP_ALIGN
     /* Perform alignment adjustments */
 
     GetEmitter()->emitLoopAlignAdjustments();
+#endif
 
     /* The code is now complete and final; it should not change after this. */
 }
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index 751e327f2a471f..d06479c3de7d91 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -791,8 +791,8 @@ void CodeGen::genCodeForBBlist()
             // all IGs that follows this IG and participate in a loop.
             GetEmitter()->emitCurIG->igFlags |= IGF_LOOP_ALIGN;
 
-            JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u to align loop header block.\n" FMT_BB,
-                    compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum);
+            JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u (" FMT_BB ") to align loop header block (" FMT_BB ").\n",
+                    compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum, block->bbNum, block->bbNext->bbNum);
         }
 #endif
 
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index adbe7db12bd1f3..148e3f70719292 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -641,6 +641,10 @@ void emitter::emitGenIG(insGroup* ig)
 
     assert(emitCurIGjmpList == nullptr);
 
+#ifdef FEATURE_LOOP_ALIGN
+    assert(emitCurIGAlignList == nullptr);
+#endif
+
     /* Allocate the temp instruction buffer if we haven't done so */
 
     if (emitCurIGfreeBase == nullptr)
@@ -827,6 +831,61 @@ insGroup* emitter::emitSavIG(bool emitAdd)
     }
 #endif
 
+#ifdef FEATURE_LOOP_ALIGN
+    // Did we have any align instructions in this group?
+    if (emitCurIGAlignList)
+    {
+        instrDescAlign* list = nullptr;
+        instrDescAlign* last = nullptr;
+
+        // Move align instructions to the global list, update their 'next' links
+        do
+        {
+            // Grab the jump and remove it from the list
+
+            instrDescAlign* oa = emitCurIGAlignList;
+            emitCurIGAlignList = oa->idaNext;
+
+            // Figure out the address of where the align got copied
+
+            size_t of = (BYTE*)oa - emitCurIGfreeBase;
+            instrDescAlign* na = (instrDescAlign*)(ig->igData + of);
+
+            assert(na->idaIG == ig);
+            assert(na->idIns() == oa->idIns());
+            assert(na->idaNext == oa->idaNext);
+            assert(na->idIns() == INS_align);
+
+            na->idaNext = list;
+            list        = na;
+
+            if (last == nullptr)
+            {
+                last = na;
+            }
+        } while (emitCurIGAlignList);
+
+        // Should have at least one align instruction
+        assert(last);
+
+        if (emitAlignList == nullptr)
+        {
+            assert(emitAlignLast == nullptr);
+
+            last->idaNext = emitAlignList;
+            emitAlignList = list;
+        }
+        else
+        {
+            last->idaNext = nullptr;
+            emitAlignLast->idaNext = list;
+        }
+
+        emitAlignLast = last;
+        assert(emitAlignLast->idaIG->igNum == emitLastAlignedIgNum);
+    }
+
+#endif
     // Did we have any jumps in this group?
 
     if (emitCurIGjmpList)
@@ -937,7 +996,12 @@ void emitter::emitBegFN(bool hasFramePtr
 
     emitCurIGfreeBase = nullptr;
     emitIGbuffSize    = 0;
+
+#ifdef FEATURE_LOOP_ALIGN
     emitLastAlignedIgNum = 0;
+    emitLastInnerLoopStartIgNum = 0;
+    emitLastInnerLoopEndIgNum = 0;
+#endif
 
     /* Record stack frame info (the temp size is just an estimate) */
 
@@ -974,6 +1038,14 @@ void emitter::emitBegFN(bool hasFramePtr
     emitNoGCIG     = false;
     emitForceNewIG = false;
 
+
+#ifdef FEATURE_LOOP_ALIGN
+    /* We don't have any align instructions */
+
+    emitAlignList = emitAlignLast = nullptr;
+    emitCurIGAlignList            = nullptr;
+#endif
+
     /* We have not recorded any live sets */
 
     assert(VarSetOps::IsEmpty(emitComp, emitThisGCrefVars));
@@ -3657,18 +3729,23 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
 
     /* Did the size of the instruction match our expectations? */
 
-    UNATIVE_OFFSET csz = (UNATIVE_OFFSET)(*dp - curInsAdr);
+    UNATIVE_OFFSET actualSize = (UNATIVE_OFFSET)(*dp - curInsAdr);
 
-    if (csz != id->idCodeSize())
+    unsigned estimatedSize = id->idCodeSize();
+    if (actualSize != estimatedSize)
     {
         // It is fatal to under-estimate the instruction size, except for alignment instructions
-        bool validCodeSize = id->idCodeSize() >= csz;
-        noway_assert(validCodeSize);
+        noway_assert(estimatedSize >= actualSize);
+
+#ifdef FEATURE_LOOP_ALIGN
+        // Should never over-estimate align instruction or any instruction before the last align instruction of a method
+        assert(id->idIns() != INS_align && emitCurIG->igNum > emitLastAlignedIgNum);
+#endif
 
 #if DEBUG_EMIT
         if (EMITVERBOSE)
         {
-            printf("Instruction predicted size = %u, actual = %u\n", id->idCodeSize(), csz);
+            printf("Instruction predicted size = %u, actual = %u\n", estimatedSize, actualSize);
         }
 #endif // DEBUG_EMIT
 
@@ -3676,7 +3753,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
 
         ig->igFlags |= IGF_UPD_ISZ;
 #if defined(TARGET_XARCH)
-        id->idCodeSize(csz);
+        id->idCodeSize(actualSize);
 #elif defined(TARGET_ARM)
 // This is done as part of emitSetShortJump();
 // insSize isz = emitInsSize(id->idInsFmt());
@@ -4535,6 +4612,9 @@ void emitter::emitJumpDistBind()
 #endif // DEBUG
 }
 
+
+#ifdef FEATURE_LOOP_ALIGN
+
 //-----------------------------------------------------------------------------
 //  For loopHeaderIg, find the size of the smallest possible loop that doesn't exceed maxLoopSize.
 //
@@ -4545,6 +4625,26 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize)
     for (insGroup* igInLoop = igLoopHeader; igInLoop != nullptr; igInLoop = igInLoop->igNext)
     {
         loopSize += igInLoop->igSize;
+        if (igInLoop->isLoopAlign())
+        {
+            // If igInLoop's next IG is a loop and needs alignment, then igInLoop should be the last IG
+            // of the current loop and should have backedge to current loop header.
+            assert(igInLoop->igLoopBackEdge == igLoopHeader);
+
+            // In such cases, the current loop size should exclude the align instruction size reserved for
+            // next loop.
+            unsigned maxPaddingAllowed;
+            if (emitComp->opts.compJitAlignLoopAdaptive)
+            {
+                maxPaddingAllowed = (emitComp->opts.compJitAlignLoopBoundary >> 1) - 1;
+            }
+            else
+            {
+                maxPaddingAllowed = emitComp->opts.compJitAlignLoopBoundary - 1;
+            }
+
+            loopSize -= maxPaddingAllowed;
+        }
         if ((igInLoop->igLoopBackEdge == igLoopHeader) || (loopSize > maxLoopSize))
         {
             break;
@@ -4555,80 +4655,386 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize)
 }
 
 //-----------------------------------------------------------------------------
-// emitCurIG jumps back to dstIG forming a loop. Set appropriate field to
-// record that information
+// emitSetLoopBackEdge : Sets igLoopBackEdge field, if not already set and
+//                       if currIG has back-edge to dstIG.
+//
+// Notes:
+//    If the current loop covers a loop that is already marked as align, then remove
+//    the alignment flag present on IG before dstIG.
 //
 void emitter::emitSetLoopBackEdge(insGroup* dstIG)
 {
-    // Only track back edges to the loop.
-    // Here dstIG != nullptr checks if we have already generated dstIG for a block.
-    // If block->bbJumpDest was a forward block, it might have not been created yet.
+    // With (dstIG != nullptr), ensure that only back edges are tracked.
+    // If there is forward jump, dstIG is not yet generated.
+    //
     // We don't rely on (block->bbJumpDest->bbNum <= block->bbNum) because the basic
     // block numbering is not guaranteed to be sequential.
-    if ((dstIG != nullptr) && (dstIG->igNum <= emitCurIG->igNum))
+
+    if ((dstIG != nullptr) && (dstIG->igNum <= emitCurIG->igNum) && (emitCurIG->igLoopBackEdge == nullptr))
     {
-        emitCurIG->igLoopBackEdge = dstIG;
+        unsigned currLoopStart = dstIG->igNum;
+        unsigned currLoopEnd   = emitCurIG->igNum;
+
+        // Only mark back-edge if current loop starts after the last inner loop ended.
+        if (emitLastInnerLoopEndIgNum < currLoopStart)
+        {
+            emitCurIG->igLoopBackEdge = dstIG;
+
+            JITDUMP("** IG%02u jumps back to IG%02u forming a loop.\n", currLoopEnd, currLoopStart);
+
+            emitLastInnerLoopStartIgNum = currLoopStart;
+            emitLastInnerLoopEndIgNum   = currLoopEnd;
+        }
+        // Otherwise, mark the dstIG->prevIG as no alignment needed.
+        //
+        // Note: If current loop's back-edge target is same as emitLastInnerLoopStartIgNum,
+        // retain the alignment flag of dstIG->prevIG so the loop
+        // (emitLastInnerLoopStartIgNum ~ emitLastInnerLoopEndIgNum) is still aligned.
+        else if (emitLastInnerLoopStartIgNum != currLoopStart)
+        {
+            // Find the IG before dstIG...
+            instrDescAlign* alignInstr = emitAlignList;
+            while ((alignInstr != nullptr) && (alignInstr->idaIG->igNext != dstIG))
+            {
+                alignInstr = alignInstr->idaNext;
+            }
+
+            // ...and clear the IGF_LOOP_ALIGN flag
+            if (alignInstr != nullptr)
+            {
+                assert(alignInstr->idaIG->igNext == dstIG);
+                alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN;
+            }
 
-        JITDUMP("** IG_%d jumps back to IG_%d forming a loop.\n", emitCurIG->igNum, dstIG->igNum);
+            JITDUMP("** Skip alignment for loop IG%02u ~ IG%02u, because it covers an aligned loop IG%02u ~ IG%02u.\n",
+                    currLoopStart, currLoopEnd, emitLastInnerLoopStartIgNum, emitLastInnerLoopEndIgNum);
+        }
     }
 }
 
 //-----------------------------------------------------------------------------
-//  For IGs that adds padding to align loops, calculate the loop size and if it exceed the
-//  threshold, then mark that alignment is not needed and hence adjust the igOffs, igSize
-//  and emitTotalCodeSize.
+//  emitLoopAlignAdjustments: Walk all the align instructions and update them
+//    with actual padding needed.
+
+//  Notes:
+//     For IGs that have align instructions in the end, calculate the actual offset
+//     of loop start and determine how much padding is needed. Based on that, update
+//     the igOffs, igSize and emitTotalCodeSize.
 //
 void emitter::emitLoopAlignAdjustments()
 {
-#ifdef TARGET_XARCH
+    // no align instructions
+    if (emitAlignList == nullptr)
+    {
+        return;
+    }
 
-    unsigned short maxPaddingAdded, alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
+    unsigned short estimatedPaddingNeeded, alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
     unsigned       maxLoopSize = 0;
+
     if (emitComp->opts.compJitAlignLoopAdaptive)
     {
         // For adaptive, adjust the loop size depending on the alignment boundary
         int maxBlocksAllowedForLoop = genLog2((unsigned)alignmentBoundary) - 1;
         maxLoopSize                 = alignmentBoundary * maxBlocksAllowedForLoop;
-        maxPaddingAdded             = (alignmentBoundary >> 1) - 1;
+        estimatedPaddingNeeded      = (alignmentBoundary >> 1) - 1;
     }
     else
     {
         // For non-adaptive, just take whatever is supplied using COMPlus_ variables
         maxLoopSize     = emitComp->opts.compJitAlignLoopMaxCodeSize;
-        maxPaddingAdded = alignmentBoundary - 1;
+        estimatedPaddingNeeded = alignmentBoundary - 1;
     }
 
     unsigned alignBytesRemoved = 0;
     unsigned loopSize          = 0;
-    for (insGroup* ig = emitIGlist; ig != nullptr; ig = ig->igNext)
+    unsigned loopIGOffset      = 0;
+    instrDescAlign* alignInstr = emitAlignList;
+
+    // track the IG that was adjusted so we can update the offsets
+    insGroup* lastIGAdj = emitAlignList->idaIG;
+
+    for (; alignInstr != nullptr; alignInstr = alignInstr->idaNext)
     {
-        ig->igOffs -= alignBytesRemoved;
+        assert(alignInstr->idIns() == INS_align);
 
-        if (!(ig->igFlags & IGF_ALIGN_LOOP))
+        insGroup* alignIG = alignInstr->idaIG;
+
+        // Adjust offsets of all IGs until the current IG
+        while (lastIGAdj->igNum <= alignIG->igNum)
         {
-            continue;
+            lastIGAdj->igOffs -= alignBytesRemoved;
+            lastIGAdj = lastIGAdj->igNext;
         }
 
-        unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize);
-        if (loopSize > maxLoopSize)
-        {
-            assert(ig->igSize >= maxPaddingAdded);
+        loopIGOffset = alignIG->igOffs + alignIG->igSize;
+
+        // igSize also includes INS_align instruction, take it off.
+        loopIGOffset -= estimatedPaddingNeeded;
+
+        // IG can be marked as not needing alignment if during setting igLoopBackEdge, it is detected
+        // that the igLoopBackEdge covers an IG that is marked for alignment.
+        unsigned actualPaddingNeeded =
+            alignIG->isLoopAlign() ? emitCalculatePaddingForLoopAlignment(alignIG, loopIGOffset DEBUG_ARG(false)) : 0;
+
+        assert(estimatedPaddingNeeded >= actualPaddingNeeded);
 
-            ig->igSize -= maxPaddingAdded;
-            alignBytesRemoved += maxPaddingAdded;
-            emitTotalCodeSize -= maxPaddingAdded;
+        unsigned short diff = (unsigned short)(estimatedPaddingNeeded - actualPaddingNeeded);
+
+        if (diff != 0)
+        {
+            alignIG->igSize -= diff;
+            alignBytesRemoved += diff;
+            emitTotalCodeSize -= diff;
 
             // Update the flags
-            ig->igFlags |= IGF_UPD_ISZ;
-            ig->igFlags &= ~IGF_ALIGN_LOOP;
+            alignIG->igFlags |= IGF_UPD_ISZ;
+            if (actualPaddingNeeded == 0)
+            {
+                alignIG->igFlags &= ~IGF_LOOP_ALIGN;
+            }
+
+            if (emitComp->opts.compJitAlignLoopAdaptive)
+            {
+                assert(actualPaddingNeeded < 15);
+                alignInstr->idCodeSize(actualPaddingNeeded);
+            }
+            else
+            {
+                unsigned paddingToAdj     = actualPaddingNeeded;
+
+#ifdef DEBUG
+                
+                int instrAdjusted = (alignmentBoundary + 14) / 15;
+#endif
+                // Adjust the padding amount in all align instructions in this IG
+                instrDescAlign *alignInstrToAdj = alignInstr, *prevAlignInstr = nullptr; 
+                for (; alignInstrToAdj != nullptr && alignInstrToAdj->idaIG == alignInstr->idaIG;
+                     alignInstrToAdj = alignInstrToAdj->idaNext)
+                {
+                    unsigned newPadding = min(paddingToAdj, 15);
+                    alignInstrToAdj->idCodeSize(newPadding);
+                    paddingToAdj -= newPadding;
+                    prevAlignInstr = alignInstrToAdj;
+#ifdef DEBUG
+                    instrAdjusted--;
+#endif
+                }
+                assert(paddingToAdj == 0);
+                assert(instrAdjusted == 0);
+
+                // fast forward the align instruction to next IG
+                alignInstr = prevAlignInstr;
+            }
 
-            JITDUMP("Removed loop alignment from G_M%03u_IG%02u: 'LoopSize= %d, MaxLoopSize= %d\n",
-                    emitComp->compMethodID, ig->igNum, loopSize, maxLoopSize);
+            JITDUMP("Adjusted alignment of G_M%03u_IG%02u from %02d to %02d\n", emitComp->compMethodID, alignIG->igNum,
+                   estimatedPaddingNeeded, actualPaddingNeeded);
         }
     }
+
+    // Do adjustments of remaining IGs
+    while (lastIGAdj != nullptr)
+    {
+        lastIGAdj->igOffs -= alignBytesRemoved;
+        lastIGAdj = lastIGAdj->igNext;
+    }
+
+#ifdef DEBUG
+    emitCheckIGoffsets();
 #endif
 }
 
+//-----------------------------------------------------------------------------
+//  emitCalculatePaddingForLoopAlignment: Calculate the padding to insert at the
+//    end of 'ig' so the loop that starts after 'ig' is aligned.
+//
+//  Returns: Padding amount.
+//    0 means no padding is needed, either because loop is already aligned or it
+//    is too expensive to align loop and hence it will not be aligned.
+//
+//  Notes:
+//     Below are the steps (in this order) to calculate the padding amount.
+//     1. If loop is already aligned to desired boundary, then return 0. // already aligned
+//     2. If loop size exceed maximum allowed loop size, then return 0.  // already aligned
+//
+// For adaptive loop alignment:
+//     3a. Calculate paddingNeeded and maxPaddingAmount to align to 32B boundary.
+//     3b. If paddingNeeded > maxPaddingAmount, then recalculate to align to 16B boundary.
+//     3b. If paddingNeeded == 0, then return 0. // already aligned at 16B
+//     3c. If paddingNeeded > maxPaddingAmount, then return 0. // expensive to align
+//     3d. If the loop already fits in minimum 32B blocks, then return 0. // already best aligned
+//     3e. return paddingNeeded.
+//
+// For non-adaptive loop alignment:
+//     3a. Calculate paddingNeeded.
+//     3b. If the loop already fits in minimum alignmentBoundary blocks, then return 0. // already best aligned
+//     3c. return paddingNeeded.
+//
+unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool displayAlignmentDetails))
+{
+    assert(ig->isLoopAlign());
+    unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
+
+    // No padding if loop is already aligned
+    if ((offset & (alignmentBoundary - 1)) == 0)
+    {
+#if DEBUG
+        if (displayAlignmentDetails)
+        {
+            printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n", alignmentBoundary,
+                   emitComp->info.compMethodName);
+        }
+#endif
+        return 0;
+    }
+
+    unsigned       maxLoopSize = 0;
+    int            maxLoopBlocksAllowed = 0;
+
+    if (emitComp->opts.compJitAlignLoopAdaptive)
+    {
+        // For adaptive, adjust the loop size depending on the alignment boundary
+        maxLoopBlocksAllowed = genLog2((unsigned)alignmentBoundary) - 1;
+        maxLoopSize          = alignmentBoundary * maxLoopBlocksAllowed;
+    }
+    else
+    {
+        // For non-adaptive, just take whatever is supplied using COMPlus_ variables
+        maxLoopSize     = emitComp->opts.compJitAlignLoopMaxCodeSize;
+    }
+
+    unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize);
+
+    // No padding if loop is big
+    if (loopSize > maxLoopSize)
+    {
+        return 0;
+    }
+
+    unsigned paddingToAdd           = 0;
+    unsigned minBlocksNeededForLoop = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
+    bool     skipPadding            = false;
+
+    if (emitComp->opts.compJitAlignLoopAdaptive)
+    {
+        // adaptive loop alignment
+        unsigned nMaxPaddingBytes       = (1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1)) - 1;
+        unsigned nPaddingBytes          = (-(int)(size_t)offset) & (alignmentBoundary - 1);
+
+        // Check if the alignment exceeds maxPadding limit
+        if (nPaddingBytes > nMaxPaddingBytes)
+        {
+            // Cannot align to 32B, so try to align to 16B boundary.
+            alignmentBoundary >>= 1;
+            nMaxPaddingBytes = 1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1);
+            nPaddingBytes    = (-(int)(size_t)offset) & (alignmentBoundary - 1);
+
+            // Check if the loop is already at new alignment boundary
+            if (nPaddingBytes == 0)
+            {
+                skipPadding = true;
+#if DEBUG
+                if (displayAlignmentDetails)
+                {
+                    printf("\t\t;; Skip alignment: 'Loop already aligned at 16B boundary.' in (%s)\n",
+                           emitComp->info.compMethodName);
+                }
+#endif
+            }
+            // Check if the alignment exceeds new maxPadding limit
+            else if (nPaddingBytes > nMaxPaddingBytes)
+            {
+                skipPadding = true;
+#if DEBUG
+                if (displayAlignmentDetails)
+                {
+                    printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
+                           "AlignmentBoundary= %dB.' in (%s)\n",
+                           nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary, emitComp->info.compFullName);
+                }
+#endif
+            }
+        }
+
+        // If within maxPaddingLimit
+        if (!skipPadding)
+        {
+            // Padding is needed only if loop starts at or after the current offset.
+            // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
+            size_t extraBytesNotInLoop =
+                (size_t)(emitComp->opts.compJitAlignLoopBoundary * minBlocksNeededForLoop) - loopSize;
+            size_t currentOffset = (size_t)offset % alignmentBoundary;
+
+            if (currentOffset > extraBytesNotInLoop)
+            {
+                // Padding is needed only if loop starts at or after the current offset and hence might not
+                // fit in minBlocksNeededForLoop
+                paddingToAdd = nPaddingBytes;
+            }
+            else
+            {
+                // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
+#if DEBUG
+                if (displayAlignmentDetails)
+                {
+                    printf("\t\t;; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.' in (%s)\n",
+                           minBlocksNeededForLoop, alignmentBoundary, emitComp->info.compMethodName);
+                }
+#endif
+            }
+        }
+    }
+    else
+    {
+        // non-adaptive loop alignment
+        unsigned extraBytesNotInLoop = (alignmentBoundary * minBlocksNeededForLoop) - loopSize;
+        unsigned currentOffset       = (size_t)offset % alignmentBoundary;
+
+#ifdef DEBUG
+        // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
+        if (emitComp->opts.compJitAlignLoopForJcc)
+        {
+            // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
+            currentOffset++;
+        }
+#endif
+
+        if (currentOffset > extraBytesNotInLoop)
+        {
+            // Padding is needed only if loop starts at or after the current offset and hence might not
+            // fit in minBlocksNeededForLoop
+            paddingToAdd = (-(int)(size_t)offset) & (alignmentBoundary - 1);
+        }
+        else
+        {
+            // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
+#if DEBUG
+            if (displayAlignmentDetails)
+            {
+                printf("\t\t;; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.' in (%s)\n",
+                       minBlocksNeededForLoop, alignmentBoundary, emitComp->info.compMethodName);
+            }
+#endif
+        }
+    }
+
+#if DEBUG
+    if (displayAlignmentDetails && paddingToAdd > 0)
+    {
+        printf("\t\t;; Add alignment: 'Padding= %d, AlignmentBoundary= %dB.' in (%s)\n", paddingToAdd,
+               alignmentBoundary, emitComp->info.compFullName);
+    }
+#endif
+
+    // Either no padding is added because it is too expensive or the offset gets aligned
+    // to the alignment boundary
+    assert(paddingToAdd == 0 || (((offset + paddingToAdd) & (alignmentBoundary - 1)) == 0));
+
+    return paddingToAdd;
+}
+
+#endif
+
 void emitter::emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG)
 {
 #ifdef DEBUG
@@ -5532,6 +5938,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
 #endif
 
     unsigned actualCodeSize = emitCurCodeOffs(cp);
+    assert(emitTotalCodeSize >= actualCodeSize);
 
 #if EMITTER_STATS
     totAllocdSize += emitTotalCodeSize;
@@ -5541,7 +5948,7 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
     // Fill in eventual unused space, but do not report this space as used.
     // If you add this padding during the emitIGlist loop, then it will
     // emit offsets after the loop with wrong value (for example for GC ref variables).
-    unsigned unusedSize = emitTotalCodeSize - emitCurCodeOffs(cp);
+    unsigned unusedSize = emitTotalCodeSize - actualCodeSize;
     for (unsigned i = 0; i < unusedSize; ++i)
     {
         *cp++ = DEFAULT_CODE_BUFFER_INIT;
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 5c33739d28bbc5..318bac608a9419 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -1369,11 +1369,13 @@ class emitter
                                   // hot to cold and cold to hot jumps)
     };
 
+#ifdef FEATURE_LOOP_ALIGN
     struct instrDescAlign : instrDesc
     {
         instrDescAlign* idaNext; // next align in the group/method
         insGroup*     idaIG;   // containing group
     };
+#endif
 
 #if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT.
     struct instrDescLbl : instrDescJmp
@@ -1752,9 +1754,18 @@ class emitter
     instrDescJmp* emitJumpLast;       // last of local jumps in method
     void          emitJumpDistBind(); // Bind all the local jumps in method
 
+#ifdef FEATURE_LOOP_ALIGN
+    instrDescAlign*   emitCurIGAlignList;     // list of align instructions in current IG
+    unsigned        emitLastInnerLoopStartIgNum; // Start IG of last inner loop
+    unsigned          emitLastInnerLoopEndIgNum; // End IG of last inner loop
+    unsigned        emitLastAlignedIgNum; // last IG that has align instruction
+    instrDescAlign* emitAlignList;    // list of local align instructions in method
+    instrDescAlign* emitAlignLast;    // last align instruction in method
     unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize); // Get the smallest loop size
     void emitSetLoopBackEdge(insGroup* dstIG);
     void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
+    unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool displayAlignmentDetails));
+#endif
 
     void emitCheckFuncletBranch(instrDesc* jmp, insGroup* jmpIG); // Check for illegal branches between funclets
 
@@ -1996,6 +2007,7 @@ class emitter
         return (instrDescCGCA*)emitAllocAnyInstr(sizeof(instrDescCGCA), attr);
     }
 
+#ifdef FEATURE_LOOP_ALIGN
     instrDescAlign* emitAllocInstrAlign()
     {
 #if EMITTER_STATS
@@ -2003,6 +2015,8 @@ class emitter
 #endif // EMITTER_STATS
         return (instrDescAlign*)emitAllocAnyInstr(sizeof(instrDescAlign), EA_1BYTE);
     }
+    instrDescAlign* emitNewInstrAlign();
+#endif
 
     instrDesc* emitNewInstrSmall(emitAttr attr);
     instrDesc* emitNewInstr(emitAttr attr = EA_4BYTE);
@@ -2019,7 +2033,6 @@ class emitter
     instrDescLbl* emitNewInstrLbl();
 #endif // !TARGET_ARM64
 
-    instrDescAlign*   emitNewInstrAlign();
     static const BYTE emitFmtToOps[];
 
 #ifdef DEBUG
@@ -2529,12 +2542,14 @@ inline emitter::instrDescJmp* emitter::emitNewInstrJmp()
     return emitAllocInstrJmp();
 }
 
+#ifdef FEATURE_LOOP_ALIGN
 inline emitter::instrDescAlign* emitter::emitNewInstrAlign()
 {
     instrDescAlign* newInstr = emitAllocInstrAlign();
     newInstr->idIns(INS_align);
     return newInstr;
 }
+#endif
 
 #if !defined(TARGET_ARM64)
 inline emitter::instrDescLbl* emitter::emitNewInstrLbl()
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 96ff299624546d..eedbf6b11015f8 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -874,9 +874,16 @@ unsigned emitter::emitOutputRexOrVexPrefixIfNeeded(instruction ins, BYTE* dst, c
         //   * W must be unset                    (0x00 validates bit 7)
         if ((vexPrefix & 0xFFFF7F80) == 0x00C46100)
         {
-            emitOutputByte(dst, 0xC5);
-            emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
-            return 2;
+            // The encoding optimization is not accounted for while estimating the instruction
+            // size, so the estimate over-predicts the instruction size by 1 byte.
+            // If there are IGs that will be aligned, do not optimize encoding so the
+            // estimated alignment sizes are accurate.
+            if (emitCurIG->igNum > emitLastAlignedIgNum)
+            {
+                emitOutputByte(dst, 0xC5);
+                emitOutputByte(dst + 1, ((vexPrefix >> 8) & 0x80) | (vexPrefix & 0x7F));
+                return 2;
+            }
         }
 
         emitOutputByte(dst, ((vexPrefix >> 16) & 0xFF));
@@ -9373,241 +9380,45 @@ static BYTE* emitOutputNOP(BYTE* dst, size_t nBytes)
     return dst;
 }
 
-BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, size_t sz, BYTE* dst)
+//--------------------------------------------------------------------
+// emitOutputAlign: Outputs NOP to align the loop
+//
+// Arguments:
+//   ig - Current instruction group
+//   id - align instruction that holds amount of padding (NOPs) to add
+//   dst - Destination buffer
+//
+// Return Value:
+//   The destination buffer pointer returned by emitOutputNOP() for the emitted padding.
+//
+// Notes:
+//   Amount of padding needed to align the loop is already calculated. This
+//   method extracts that information and inserts that many NOPs.
+//
+BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst)
 {
     // Candidate for loop alignment
     assert(codeGen->ShouldAlignLoops());
+    assert(ig->isLoopAlign());
 
-    unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
-    bool     skipPadding       = false;
-
-#if DEBUG
-    bool displayAlignmentDetails = (emitComp->opts.disAsm /*&& emitComp->opts.disAddr*/) || emitComp->verbose;
-#endif
-    // Check if we already detected that this IG does not need alignment
-    if ((ig->igFlags & IGF_ALIGN_LOOP) == 0)
-    {
-        skipPadding = true;
-#if DEBUG
-        if (displayAlignmentDetails)
-        {
-            printf("\t\t;; Skip alignment: 'Big loop.' in (%s)\n", emitComp->info.compFullName);
-        }
-#endif
-    }
-
-    // Check if the loop is already at alignment boundary
-    if (((size_t)dst & (alignmentBoundary - 1)) == 0)
-    {
-        skipPadding = true;
-#if DEBUG
-        if (displayAlignmentDetails)
-        {
-            printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n", alignmentBoundary,
-                   emitComp->info.compMethodName);
-        }
-#endif
-    }
-
-    unsigned paddingToAdd = 0;
-    if (!skipPadding)
-    {
-        // Adaptive padding
-        if (emitComp->opts.compJitAlignLoopAdaptive)
-        {
-            // Start to align on 32B boundary with a fallback to 16B boundary
-            int      maxBlocksAllowedForLoop = genLog2(alignmentBoundary) - 1;
-            unsigned maxLoopSize             = alignmentBoundary * maxBlocksAllowedForLoop;
-            unsigned loopSize                = getLoopSize(ig->igNext, maxLoopSize);
-            unsigned minBlocksNeededForLoop  = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-            unsigned nMaxPaddingBytes        = (1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1)) - 1;
-            unsigned nPaddingBytes           = (-(int)(size_t)dst) & (alignmentBoundary - 1);
-
-            // Check if the loop exceed maxSize
-            if (loopSize > maxLoopSize)
-            {
-                skipPadding = true;
-                assert(!"Should never hit maxLoopSize threshold because it should have been predicted "
-                        "earlier.");
-            }
-
-            // Check if the alignment exceeds maxPadding limit
-            else if (nPaddingBytes > nMaxPaddingBytes)
-            {
-                // Cannot align to 32B, so try to align to 16B boundary.
-                alignmentBoundary >>= 1;
-                nMaxPaddingBytes = 1 << (maxBlocksAllowedForLoop - minBlocksNeededForLoop + 1);
-                nPaddingBytes    = (-(int)(size_t)dst) & (alignmentBoundary - 1);
-
-                // Check if the loop is already at new alignment boundary
-                if (nPaddingBytes == 0)
-                {
-                    skipPadding = true;
-#if DEBUG
-                    if (displayAlignmentDetails)
-                    {
-                        printf("\t\t;; Skip alignment: 'Loop already aligned at 16B boundary.' in (%s)\n",
-                               emitComp->info.compMethodName);
-                    }
-#endif
-                }
-                // Check if the alignment exceeds new maxPadding limit
-                else if (nPaddingBytes > nMaxPaddingBytes)
-                {
-                    skipPadding = true;
-#if DEBUG
-                    if (displayAlignmentDetails)
-                    {
-                        printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
-                               "AlignmentBoundary= %dB.' in (%s)\n",
-                               nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary,
-                               emitComp->info.compFullName);
-                    }
-#endif
-                }
-            }
-
-            if (!skipPadding)
-            {
-                // Padding is needed only if loop starts at or after the current offset.
-                // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
-                size_t extraBytesNotInLoop =
-                    (size_t)(emitComp->opts.compJitAlignLoopBoundary * minBlocksNeededForLoop) - loopSize;
-                size_t currentOffset = (size_t)dst % alignmentBoundary;
-
-                // Check if loop starts from offset such that padding can be skipped.
-                if (currentOffset <= extraBytesNotInLoop)
-                {
-                    skipPadding = true;
-#if DEBUG
-                    if (displayAlignmentDetails)
-                    {
-                        printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
-                               alignmentBoundary, emitComp->info.compMethodName);
-                    }
-#endif
-                }
-                else
-                {
-                    // Perform the padding
-                    paddingToAdd = nPaddingBytes;
-                }
-            }
-        }
-        // Non-adaptive padding
-        else
-        {
-            instrDesc* nextId = id;
-            castto(nextId, BYTE*) += sz;
+    unsigned paddingToAdd = id->idCodeSize();
 
-            // For padding > 15 bytes, check if we already performed/skipped
-            // padding during previous INS_align instruction.
-            // If yes, skip for current instruction as well as next, if that
-            // too is INS_align.
-            if ((id->idCodeSize() == 0))
-            {
-                if (nextId->idIns() == INS_align)
-                {
-                    assert(alignmentBoundary > 16);
-                    nextId->idCodeSize(0);
-                }
-                return dst;
-            }
+    // Either things are already aligned or align them here.
+    assert((paddingToAdd == 0) || (((size_t)dst & (emitComp->opts.compJitAlignLoopBoundary - 1)) != 0));
 
-            unsigned short maxLoopSize         = emitComp->opts.compJitAlignLoopMaxCodeSize;
-            unsigned       loopSize            = getLoopSize(ig->igNext, maxLoopSize);
-            unsigned       minimumBlocksNeeded = (loopSize + alignmentBoundary - 1) / alignmentBoundary;
-            unsigned       extraBytesNotInLoop = (alignmentBoundary * minimumBlocksNeeded) - loopSize;
-            unsigned       currentOffset       = (size_t)dst % alignmentBoundary;
+    // Padding amount should not exceed the alignment boundary
+    assert(0 <= paddingToAdd && paddingToAdd < emitComp->opts.compJitAlignLoopBoundary);
 
 #ifdef DEBUG
-            // Mitigate JCC erratum by making sure the jmp doesn't fall on the boundary
-            if (emitComp->opts.compJitAlignLoopForJcc)
-            {
-                // TODO: See if extra padding we might end up adding to mitigate JCC erratum is worth doing?
-                currentOffset++;
-            }
-#endif
-            // Check if the loop exceed maxSize
-            if (loopSize > maxLoopSize)
-            {
-                skipPadding = true;
-                assert(!"Should never hit maxLoopSize threshold because it should have been predicted "
-                        "earlier.");
-            }
+    bool     displayAlignmentDetails = (emitComp->opts.disAsm /*&& emitComp->opts.disAddr*/) || emitComp->verbose;
+    unsigned paddingNeeded           = emitCalculatePaddingForLoopAlignment(ig, (size_t)dst, displayAlignmentDetails);
 
-            // Padding is needed only if loop starts at or after the current offset.
-            // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
-            else if (currentOffset <= extraBytesNotInLoop)
-            {
-                skipPadding = true;
-#if DEBUG
-                if (displayAlignmentDetails)
-                {
-                    printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n",
-                           alignmentBoundary, emitComp->info.compMethodName);
-                }
+    // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking
+    // unless non-adaptive approach is ON by default
+    assert((paddingToAdd == paddingNeeded) || !emitComp->opts.compJitAlignLoopAdaptive);
 #endif
-            }
-            else
-            {
-                // Perform the padding
-                paddingToAdd = (-(int)(size_t)dst) & (alignmentBoundary - 1);
-            }
-
-            // For padding > 15 bytes, multiple INS_align(15) are emitted.
-            // If decided to skipPadding, just mark it so for future INS_align
-            // instructions as well.
-            if (!skipPadding)
-            {
-                if (nextId->idIns() == INS_align)
-                {
-                    assert(alignmentBoundary > 16);
-                    nextId->idCodeSize(0);
-                }
-            }
-        }
-    }
 
-    // Add the padding, if needed.
-    if (paddingToAdd > 0)
-    {
-        assert(!skipPadding);
-        assert(((size_t)dst & (alignmentBoundary - 1)) != 0);
-
-        unsigned padCounts   = paddingToAdd / 15;
-        unsigned lastPadding = paddingToAdd % 15;
-
-        // TODO: For padding > 15 bytes, evaluate the sequence of NOPs emitted
-        //      and see if they can be improved.
-        while (padCounts)
-        {
-            dst = emitOutputNOP(dst, 15);
-            padCounts--;
-        }
-
-        dst = emitOutputNOP(dst, lastPadding);
-
-#if DEBUG
-        if (displayAlignmentDetails)
-        {
-            printf("\t\t;; Add alignment: 'Padding= %d, AlignmentBoundary= %dB.' in (%s)\n", paddingToAdd,
-                   alignmentBoundary, emitComp->info.compFullName);
-        }
-#endif
-        // In the end dst should be at alignment boundary
-        assert(((size_t)dst & (alignmentBoundary - 1)) == 0);
-    }
-
-    // If we didn't add as much padding as we thought, update the code size and flag.
-    if (paddingToAdd != id->idCodeSize())
-    {
-        assert(paddingToAdd != 0 || skipPadding);
-        id->idCodeSize(paddingToAdd);
-        ig->igFlags |= IGF_UPD_ISZ;
-    }
-
-    return dst;
+    return emitOutputNOP(dst, paddingToAdd);
 }
 
 /*****************************************************************************
@@ -12883,8 +12694,13 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             // the loop alignment pseudo instruction
             if (ins == INS_align)
             {
-                dst = emitOutputAlign(ig, id, sz, dst);
                 sz  = sizeof(instrDescAlign);
+                // IG can be marked as not needing alignment after emitting align instruction
+                // In such case, skip outputting alignment.
+                if (ig->isLoopAlign())
+                {
+                    dst = emitOutputAlign(ig, id, dst);
+                }
                 break;
             }
 
@@ -13989,6 +13805,24 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         emitDispIns(id, false, dspOffs, true, emitCurCodeOffs(*dp), *dp, (dst - *dp));
     }
 
+#ifdef FEATURE_LOOP_ALIGN
+    // Only compensate over-estimated instructions if emitCurIG is before
+    // the last IG that needs alignment.
+    if (emitCurIG->igNum <= emitLastAlignedIgNum)
+    {
+         unsigned diff = id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp));
+        if (diff != 0)
+        {
+            // should never over-estimate align instruction
+            assert(id->idIns() != INS_align);
+            JITDUMP("Added over-estimation compensation: %d\n", diff);
+
+            dst = emitOutputNOP(dst, diff);
+        }
+        assert((id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp))) == 0);
+    }
+#endif
+
     if (emitComp->compDebugBreak)
     {
         // set JitEmitPrintRefRegs=1 will print out emitThisGCrefRegs and emitThisByrefRegs
diff --git a/src/coreclr/jit/emitxarch.h b/src/coreclr/jit/emitxarch.h
index c27e5c851e2b76..b0a8327acedb69 100644
--- a/src/coreclr/jit/emitxarch.h
+++ b/src/coreclr/jit/emitxarch.h
@@ -50,7 +50,7 @@ UNATIVE_OFFSET emitInsSizeAM(instrDesc* id, code_t code, int val);
 UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code);
 UNATIVE_OFFSET emitInsSizeCV(instrDesc* id, code_t code, int val);
 
-BYTE* emitOutputAlign(insGroup* ig, instrDesc* id, size_t sz, BYTE* dst);
+BYTE* emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst);
 BYTE* emitOutputAM(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr);
 BYTE* emitOutputSV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr);
 BYTE* emitOutputCV(BYTE* dst, instrDesc* id, code_t code, CnsVal* addc = nullptr);
@@ -288,7 +288,7 @@ inline emitAttr emitDecodeScale(unsigned ensz)
 /************************************************************************/
 
 public:
-void emitLoopAlign();
+void emitLoopAlign(unsigned short paddingBytes = 15);
 
 void emitLongLoopAlign(unsigned short alignmentBoundary);
 
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 5a0c0cef7683ef..9345be1929a1d3 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -5177,9 +5177,10 @@ void Compiler::optCloneLoop(unsigned loopInd, LoopCloneContext* context)
 {
     assert(loopInd < optLoopCount);
 
-    JITDUMP("\nCloning loop %d: [h: %d, f: %d, t: %d, e: %d, b: %d].\n", loopInd, optLoopTable[loopInd].lpHead->bbNum,
-            optLoopTable[loopInd].lpFirst->bbNum, optLoopTable[loopInd].lpTop->bbNum,
-            optLoopTable[loopInd].lpEntry->bbNum, optLoopTable[loopInd].lpBottom->bbNum);
+    JITDUMP("\nCloning loop %d: [h: %d, f: %d, t: %d, e: %d, b: %d, c: %d].\n", loopInd,
+            optLoopTable[loopInd].lpHead->bbNum, optLoopTable[loopInd].lpFirst->bbNum,
+            optLoopTable[loopInd].lpTop->bbNum, optLoopTable[loopInd].lpEntry->bbNum,
+            optLoopTable[loopInd].lpBottom->bbNum, optLoopTable[loopInd].lpChild);
 
     // Determine the depth of the loop, so we can properly weight blocks added (outside the cloned loop blocks).
     unsigned             depth         = optLoopDepth(loopInd);
@@ -8014,6 +8015,7 @@ void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
         lnum                              = optLoopTable[lnum].lpParent;
     }
 
+#ifdef FEATURE_LOOP_ALIGN
     // If this is the inner most loop, reset the LOOP_ALIGN flag
     // because a loop having call will not likely to benefit from
     // alignment
@@ -8024,6 +8026,7 @@ void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
         JITDUMP("Skip alignment for L%02u that starts at " FMT_BB " because loop has a call.\n", nestedLoopNum,
                 first->bbNum);
     }
+#endif
 }
 
 // Adds the variable liveness information for 'blk' to 'this' LoopDsc

From 305b812596e4edf6ad540ae7ce91e91adbc9eec9 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Tue, 15 Dec 2020 18:16:34 -0800
Subject: [PATCH 40/59] jit format

---
 src/coreclr/jit/codegenlinear.cpp |  6 +++--
 src/coreclr/jit/emit.cpp          | 45 +++++++++++++++----------------
 src/coreclr/jit/emit.h            | 16 +++++------
 src/coreclr/jit/emitxarch.cpp     |  8 +++---
 4 files changed, 38 insertions(+), 37 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index d06479c3de7d91..ce1454bcfd8947 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -791,8 +791,10 @@ void CodeGen::genCodeForBBlist()
             // all IGs that follows this IG and participate in a loop.
             GetEmitter()->emitCurIG->igFlags |= IGF_LOOP_ALIGN;
 
-            JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u (" FMT_BB ") to align loop header block (" FMT_BB ").\n",
-                    compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum, block->bbNum, block->bbNext->bbNum);
+            JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u (" FMT_BB
+                    ") to align loop header block (" FMT_BB ").\n",
+                    compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum,
+                    block->bbNum, block->bbNext->bbNum);
         }
 #endif
 
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 148e3f70719292..6e412faf02d55c 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -848,7 +848,7 @@ insGroup* emitter::emitSavIG(bool emitAdd)
 
             // Figure out the address of where the align got copied
 
-            size_t of = (BYTE*)oa - emitCurIGfreeBase;
+            size_t          of = (BYTE*)oa - emitCurIGfreeBase;
             instrDescAlign* na = (instrDescAlign*)(ig->igData + of);
 
             assert(na->idaIG == ig);
@@ -877,7 +877,7 @@ insGroup* emitter::emitSavIG(bool emitAdd)
         }
         else
         {
-            last->idaNext = nullptr;
+            last->idaNext          = nullptr;
             emitAlignLast->idaNext = list;
         }
 
@@ -998,9 +998,9 @@ void emitter::emitBegFN(bool hasFramePtr
     emitIGbuffSize    = 0;
 
 #ifdef FEATURE_LOOP_ALIGN
-    emitLastAlignedIgNum = 0;
+    emitLastAlignedIgNum        = 0;
     emitLastInnerLoopStartIgNum = 0;
-    emitLastInnerLoopEndIgNum = 0;
+    emitLastInnerLoopEndIgNum   = 0;
 #endif
 
     /* Record stack frame info (the temp size is just an estimate) */
@@ -1038,7 +1038,6 @@ void emitter::emitBegFN(bool hasFramePtr
     emitNoGCIG     = false;
     emitForceNewIG = false;
 
-
 #ifdef FEATURE_LOOP_ALIGN
     /* We don't have any align instructions */
 
@@ -4612,7 +4611,6 @@ void emitter::emitJumpDistBind()
 #endif // DEBUG
 }
 
-
 #ifdef FEATURE_LOOP_ALIGN
 
 //-----------------------------------------------------------------------------
@@ -4742,14 +4740,14 @@ void emitter::emitLoopAlignAdjustments()
     else
     {
         // For non-adaptive, just take whatever is supplied using COMPlus_ variables
-        maxLoopSize     = emitComp->opts.compJitAlignLoopMaxCodeSize;
+        maxLoopSize            = emitComp->opts.compJitAlignLoopMaxCodeSize;
         estimatedPaddingNeeded = alignmentBoundary - 1;
     }
 
-    unsigned alignBytesRemoved = 0;
-    unsigned loopSize          = 0;
-    unsigned loopIGOffset      = 0;
-    instrDescAlign* alignInstr = emitAlignList;
+    unsigned        alignBytesRemoved = 0;
+    unsigned        loopSize          = 0;
+    unsigned        loopIGOffset      = 0;
+    instrDescAlign* alignInstr        = emitAlignList;
 
     // track the IG that was adjusted so we can update the offsets
     insGroup* lastIGAdj = emitAlignList->idaIG;
@@ -4801,14 +4799,14 @@ void emitter::emitLoopAlignAdjustments()
             }
             else
             {
-                unsigned paddingToAdj     = actualPaddingNeeded;
+                unsigned paddingToAdj = actualPaddingNeeded;
 
 #ifdef DEBUG
-                
+
                 int instrAdjusted = (alignmentBoundary + 14) / 15;
 #endif
                 // Adjust the padding amount in all align instructions in this IG
-                instrDescAlign *alignInstrToAdj = alignInstr, *prevAlignInstr = nullptr; 
+                instrDescAlign *alignInstrToAdj = alignInstr, *prevAlignInstr = nullptr;
                 for (; alignInstrToAdj != nullptr && alignInstrToAdj->idaIG == alignInstr->idaIG;
                      alignInstrToAdj = alignInstrToAdj->idaNext)
                 {
@@ -4828,7 +4826,7 @@ void emitter::emitLoopAlignAdjustments()
             }
 
             JITDUMP("Adjusted alignment of G_M%03u_IG%02u from %02d to %02d\n", emitComp->compMethodID, alignIG->igNum,
-                   estimatedPaddingNeeded, actualPaddingNeeded);
+                    estimatedPaddingNeeded, actualPaddingNeeded);
         }
     }
 
@@ -4870,7 +4868,8 @@ void emitter::emitLoopAlignAdjustments()
 //     3b. If the loop already fits in minimum alignmentBoundary blocks, then return 0. // already best aligned
 //     3c. return paddingNeeded.
 //
-unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool displayAlignmentDetails))
+unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
+                                                       size_t offset DEBUG_ARG(bool displayAlignmentDetails))
 {
     assert(ig->isLoopAlign());
     unsigned alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
@@ -4888,8 +4887,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
         return 0;
     }
 
-    unsigned       maxLoopSize = 0;
-    int            maxLoopBlocksAllowed = 0;
+    unsigned maxLoopSize          = 0;
+    int      maxLoopBlocksAllowed = 0;
 
     if (emitComp->opts.compJitAlignLoopAdaptive)
     {
@@ -4900,7 +4899,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
     else
     {
         // For non-adaptive, just take whatever is supplied using COMPlus_ variables
-        maxLoopSize     = emitComp->opts.compJitAlignLoopMaxCodeSize;
+        maxLoopSize = emitComp->opts.compJitAlignLoopMaxCodeSize;
     }
 
     unsigned loopSize = getLoopSize(ig->igNext, maxLoopSize);
@@ -4918,8 +4917,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
     if (emitComp->opts.compJitAlignLoopAdaptive)
     {
         // adaptive loop alignment
-        unsigned nMaxPaddingBytes       = (1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1)) - 1;
-        unsigned nPaddingBytes          = (-(int)(size_t)offset) & (alignmentBoundary - 1);
+        unsigned nMaxPaddingBytes = (1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1)) - 1;
+        unsigned nPaddingBytes    = (-(int)(size_t)offset) & (alignmentBoundary - 1);
 
         // Check if the alignment exceeds maxPadding limit
         if (nPaddingBytes > nMaxPaddingBytes)
@@ -4973,7 +4972,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
             }
             else
             {
-                // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
+// Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
 #if DEBUG
                 if (displayAlignmentDetails)
                 {
@@ -5007,7 +5006,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
         }
         else
         {
-            // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
+// Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
 #if DEBUG
             if (displayAlignmentDetails)
             {
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 318bac608a9419..ea136e528c1294 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -1373,7 +1373,7 @@ class emitter
     struct instrDescAlign : instrDesc
     {
         instrDescAlign* idaNext; // next align in the group/method
-        insGroup*     idaIG;   // containing group
+        insGroup*       idaIG;   // containing group
     };
 #endif
 
@@ -1755,15 +1755,15 @@ class emitter
     void          emitJumpDistBind(); // Bind all the local jumps in method
 
 #ifdef FEATURE_LOOP_ALIGN
-    instrDescAlign*   emitCurIGAlignList;     // list of align instructions in current IG
-    unsigned        emitLastInnerLoopStartIgNum; // Start IG of last inner loop
-    unsigned          emitLastInnerLoopEndIgNum; // End IG of last inner loop
-    unsigned        emitLastAlignedIgNum; // last IG that has align instruction
-    instrDescAlign* emitAlignList;    // list of local align instructions in method
-    instrDescAlign* emitAlignLast;    // last align instruction in method
+    instrDescAlign* emitCurIGAlignList;                                 // list of align instructions in current IG
+    unsigned        emitLastInnerLoopStartIgNum;                        // Start IG of last inner loop
+    unsigned        emitLastInnerLoopEndIgNum;                          // End IG of last inner loop
+    unsigned        emitLastAlignedIgNum;                               // last IG that has align instruction
+    instrDescAlign* emitAlignList;                                      // list of local align instructions in method
+    instrDescAlign* emitAlignLast;                                      // last align instruction in method
     unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize); // Get the smallest loop size
     void emitSetLoopBackEdge(insGroup* dstIG);
-    void emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
+    void     emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
     unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool displayAlignmentDetails));
 #endif
 
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index eedbf6b11015f8..9a75527197042e 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2669,7 +2669,7 @@ void emitter::emitLoopAlign(unsigned short paddingBytes)
     /* Insert a pseudo-instruction to ensure that we align
        the next instruction properly */
 
-    paddingBytes = min(paddingBytes, 15);  // We may need to skip up to 15 bytes of code
+    paddingBytes       = min(paddingBytes, 15); // We may need to skip up to 15 bytes of code
     instrDescAlign* id = emitNewInstrAlign();
     id->idCodeSize(paddingBytes);
     emitCurIGsize += paddingBytes;
@@ -2677,7 +2677,7 @@ void emitter::emitLoopAlign(unsigned short paddingBytes)
     id->idaIG = emitCurIG;
 
     /* Append this instruction to this IG's jump list */
-    id->idaNext = emitCurIGAlignList;
+    id->idaNext        = emitCurIGAlignList;
     emitCurIGAlignList = id;
 
     /* Record the last IG that has align instruction */
@@ -12694,7 +12694,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             // the loop alignment pseudo instruction
             if (ins == INS_align)
             {
-                sz  = sizeof(instrDescAlign);
+                sz = sizeof(instrDescAlign);
                 // IG can be marked as not needing alignment after emitting align instruction
                 // In such case, skip outputting alignment.
                 if (ig->isLoopAlign())
@@ -13810,7 +13810,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
     // the last IG that needs alignment.
     if (emitCurIG->igNum <= emitLastAlignedIgNum)
     {
-         unsigned diff = id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp));
+        unsigned diff = id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp));
         if (diff != 0)
         {
             // should never over-estimate align instruction

From f8bdfecf50df669dcaddec7638791f79909f61e1 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 16 Dec 2020 12:23:44 -0800
Subject: [PATCH 41/59] fix issue related to needLabel

---
 src/coreclr/jit/codegenlinear.cpp |  5 ++-
 src/coreclr/jit/emit.cpp          | 54 ++++++++-----------------------
 src/coreclr/jit/flowgraph.cpp     | 13 +++++---
 3 files changed, 24 insertions(+), 48 deletions(-)

diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index ce1454bcfd8947..ccecd692962e0e 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -791,10 +791,9 @@ void CodeGen::genCodeForBBlist()
             // all IGs that follows this IG and participate in a loop.
             GetEmitter()->emitCurIG->igFlags |= IGF_LOOP_ALIGN;
 
-            JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u (" FMT_BB
-                    ") to align loop header block (" FMT_BB ").\n",
+            JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u to align loop# %d.\n",
                     compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum,
-                    block->bbNum, block->bbNext->bbNum);
+                    block->bbNext->bbNatLoopNum);
         }
 #endif
 
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 6e412faf02d55c..300b8bfaf22e5e 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4877,13 +4877,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
     // No padding if loop is already aligned
     if ((offset & (alignmentBoundary - 1)) == 0)
     {
-#if DEBUG
-        if (displayAlignmentDetails)
-        {
-            printf("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.' in (%s)\n", alignmentBoundary,
-                   emitComp->info.compMethodName);
-        }
-#endif
+        JITDUMP("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.'\n", alignmentBoundary);
         return 0;
     }
 
@@ -4907,6 +4901,8 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
     // No padding if loop is big
     if (loopSize > maxLoopSize)
     {
+        JITDUMP("\t\t;; Skip alignment: 'Loop is big. LoopSize= %d, MaxLoopSize= %d.'\n", alignmentBoundary, loopSize,
+                maxLoopSize);
         return 0;
     }
 
@@ -4932,26 +4928,15 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
             if (nPaddingBytes == 0)
             {
                 skipPadding = true;
-#if DEBUG
-                if (displayAlignmentDetails)
-                {
-                    printf("\t\t;; Skip alignment: 'Loop already aligned at 16B boundary.' in (%s)\n",
-                           emitComp->info.compMethodName);
-                }
-#endif
+                JITDUMP("\t\t;; Skip alignment: 'Loop already aligned at 16B boundary.'\n");
             }
             // Check if the alignment exceeds new maxPadding limit
             else if (nPaddingBytes > nMaxPaddingBytes)
             {
                 skipPadding = true;
-#if DEBUG
-                if (displayAlignmentDetails)
-                {
-                    printf("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
-                           "AlignmentBoundary= %dB.' in (%s)\n",
-                           nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary, emitComp->info.compFullName);
-                }
-#endif
+                JITDUMP("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
+                        "AlignmentBoundary= %dB.'\n",
+                        nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary);
             }
         }
 
@@ -4972,14 +4957,9 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
             }
             else
             {
-// Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
-#if DEBUG
-                if (displayAlignmentDetails)
-                {
-                    printf("\t\t;; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.' in (%s)\n",
-                           minBlocksNeededForLoop, alignmentBoundary, emitComp->info.compMethodName);
-                }
-#endif
+                // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
+                JITDUMP("\t\t;; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n",
+                        minBlocksNeededForLoop, alignmentBoundary);
             }
         }
     }
@@ -5006,22 +4986,16 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
         }
         else
         {
-// Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
-#if DEBUG
-            if (displayAlignmentDetails)
-            {
-                printf("\t\t;; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.' in (%s)\n",
-                       minBlocksNeededForLoop, alignmentBoundary, emitComp->info.compMethodName);
-            }
-#endif
+            // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
+            JITDUMP("\t\t;; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n",
+                    minBlocksNeededForLoop, alignmentBoundary);
         }
     }
 
 #if DEBUG
     if (displayAlignmentDetails && paddingToAdd > 0)
     {
-        printf("\t\t;; Add alignment: 'Padding= %d, AlignmentBoundary= %dB.' in (%s)\n", paddingToAdd,
-               alignmentBoundary, emitComp->info.compFullName);
+        printf("\t\t;; Add alignment: 'Padding= %d, AlignmentBoundary= %dB.'\n", paddingToAdd, alignmentBoundary);
     }
 #endif
 
diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index bcceb7d2763a6c..8ce88cb090ecb1 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -11536,6 +11536,14 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable)
         if (block->isLoopHead() && (succBlock->bbNum <= block->bbNum))
         {
             succBlock->bbFlags |= BBF_LOOP_HEAD;
+
+            if (block->isLoopAlign())
+            {
+                succBlock->bbFlags |= BBF_LOOP_ALIGN;
+                JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " for loop# %d.", block->bbNum,
+                        succBlock->bbNum, block->bbNatLoopNum);
+            }
+
             if (fgDomsComputed && fgReachable(succBlock, block))
             {
                 /* Mark all the reachable blocks between 'succBlock' and 'block', excluding 'block' */
@@ -11547,11 +11555,6 @@ void Compiler::fgRemoveBlock(BasicBlock* block, bool unreachable)
             skipUnmarkLoop = true;
         }
 
-        if (block->isLoopAlign())
-        {
-            succBlock->bbFlags |= BBF_LOOP_ALIGN;
-        }
-
         noway_assert(succBlock);
 
         // If this is the first Cold basic block update fgFirstColdBlock

From a205cc02475e7b01e1d4b36b6a7fea4727b5d224 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 16 Dec 2020 20:54:50 -0800
Subject: [PATCH 42/59] align memory correctly in superpmi

---
 .../ToolBox/superpmi/superpmi/icorjitinfo.cpp    | 16 +++++++++++++++-
 src/coreclr/jit/emit.cpp                         |  9 +++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp b/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp
index 78b0a464269541..691f9973ce2626 100644
--- a/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp
+++ b/src/coreclr/ToolBox/superpmi/superpmi/icorjitinfo.cpp
@@ -1609,7 +1609,21 @@ void MyICJI::allocMem(ULONG              hotCodeSize,   /* IN */
     jitInstance->mc->cr->AddCall("allocMem");
 
     // TODO-Cleanup: Could hot block size be ever 0?
-    *hotCodeBlock = jitInstance->mc->cr->allocateMemory(hotCodeSize);
+    size_t codeAlignment      = sizeof(void*);
+    size_t hotCodeAlignedSize = static_cast<size_t>(hotCodeSize);
+
+    if ((flag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0)
+    {
+         codeAlignment = 32;
+    }
+    else if ((flag & CORJIT_ALLOCMEM_FLG_16BYTE_ALIGN) != 0)
+    {
+         codeAlignment = 16;
+    }
+    hotCodeAlignedSize = ALIGN_UP_SPMI(hotCodeAlignedSize, codeAlignment);
+    hotCodeAlignedSize = hotCodeAlignedSize + (codeAlignment - sizeof(void*));
+    *hotCodeBlock      = jitInstance->mc->cr->allocateMemory(hotCodeAlignedSize);
+    *hotCodeBlock      = ALIGN_UP_SPMI(*hotCodeBlock, codeAlignment);
 
     if (coldCodeSize > 0)
         *coldCodeBlock = jitInstance->mc->cr->allocateMemory(coldCodeSize);
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 300b8bfaf22e5e..82c3462f8551b5 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4727,6 +4727,8 @@ void emitter::emitLoopAlignAdjustments()
         return;
     }
 
+    JITDUMP("*************** In emitLoopAlignAdjustments()\n");
+
     unsigned short estimatedPaddingNeeded, alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
     unsigned       maxLoopSize = 0;
 
@@ -5370,6 +5372,13 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
                             (void**)&codeBlock, (void**)&coldCodeBlock, (void**)&consBlock);
 #endif
 
+#ifdef DEBUG
+    if ((allocMemFlag & CORJIT_ALLOCMEM_FLG_32BYTE_ALIGN) != 0)
+    {
+        assert(((size_t)codeBlock & 31) == 0);
+    }
+#endif
+
     // if (emitConsDsc.dsdOffs)
     //     printf("Cons=%08X\n", consBlock);
 

From d576e9cb8ef60ff0f69dee41517f5438c45745f7 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 17 Dec 2020 14:19:56 -0800
Subject: [PATCH 43/59] Few more fixes:

- emitOffsAdj accounts for any mis-prediction of a jump's size. If we compensate for that mis-prediction (by padding with NOPs), revert that adjustment.
- Record the lastAlignIG only for valid non-zero align instructions
---
 src/coreclr/jit/emit.cpp      |  7 ++++++-
 src/coreclr/jit/emitxarch.cpp | 29 ++++++++++++++++++++---------
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 82c3462f8551b5..1dbae793cc2fe7 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -882,7 +882,6 @@ insGroup* emitter::emitSavIG(bool emitAdd)
         }
 
         emitAlignLast = last;
-        assert(emitAlignLast->idaIG->igNum == emitLastAlignedIgNum);
     }
 
 #endif
@@ -4830,6 +4829,12 @@ void emitter::emitLoopAlignAdjustments()
             JITDUMP("Adjusted alignment of G_M%03u_IG%02u from %02d to %02d\n", emitComp->compMethodID, alignIG->igNum,
                     estimatedPaddingNeeded, actualPaddingNeeded);
         }
+
+        if (actualPaddingNeeded > 0)
+        {
+            /* Record the last IG that will have non-zero align instruction */
+            emitLastAlignedIgNum = alignIG->igNum;
+        }
     }
 
     // Do adjustments of remaining IGs
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 9a75527197042e..aefdb0bc82b097 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2679,9 +2679,6 @@ void emitter::emitLoopAlign(unsigned short paddingBytes)
     /* Append this instruction to this IG's jump list */
     id->idaNext        = emitCurIGAlignList;
     emitCurIGAlignList = id;
-
-    /* Record the last IG that has align instruction */
-    emitLastAlignedIgNum = emitCurIG->igNum;
 }
 
 //-----------------------------------------------------------------------------
@@ -12494,7 +12491,7 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i)
 #ifdef DEBUG
             if (emitComp->verbose)
             {
-                printf("; NOTE: size of jump [%08X] mis-predicted\n", emitComp->dspPtr(id));
+                printf("; NOTE: size of jump [%08X] mis-predicted by %d bytes\n", emitComp->dspPtr(id), (id->idCodeSize() - JMP_SIZE_SMALL));
             }
 #endif
         }
@@ -12655,10 +12652,11 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 {
     assert(emitIssuing);
 
-    BYTE*         dst           = *dp;
-    size_t        sz            = sizeof(instrDesc);
-    instruction   ins           = id->idIns();
-    unsigned char callInstrSize = 0;
+    BYTE*         dst               = *dp;
+    size_t        sz                = sizeof(instrDesc);
+    instruction   ins               = id->idIns();
+    unsigned char callInstrSize     = 0;
+    int           emitOffsAdjBefore = emitOffsAdj;
 
 #ifdef DEBUG
     bool dspOffs = emitComp->opts.dspGCtbls;
@@ -13810,7 +13808,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
     // the last IG that needs alignment.
     if (emitCurIG->igNum <= emitLastAlignedIgNum)
     {
-        unsigned diff = id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp));
+        int diff = id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp));
+        assert(diff >= 0);
         if (diff != 0)
         {
             // should never over-estimate align instruction
@@ -13818,6 +13817,18 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             JITDUMP("Added over-estimation compensation: %d\n", diff);
 
             dst = emitOutputNOP(dst, diff);
+
+            // since we compensated the over-estimation, revert the offsAdj that
+            // might have happened in the jump
+            if (emitOffsAdjBefore != emitOffsAdj)
+            {
+#ifdef DEBUG
+                insFormat format = id->idInsFmt();
+                assert((format == IF_LABEL) || (format == IF_RWR_LABEL) || (format == IF_SWR_LABEL));
+                assert(diff == (emitOffsAdj - emitOffsAdjBefore));
+#endif
+                emitOffsAdj -= diff;
+            }
         }
         assert((id->idCodeSize() - ((UNATIVE_OFFSET)(dst - *dp))) == 0);
     }

From 28480d1a149edb9e89cedb503e6f2867844acb46 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 17 Dec 2020 16:02:37 -0800
Subject: [PATCH 44/59] minor JITDUMP messages

---
 src/coreclr/jit/emit.cpp | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 1dbae793cc2fe7..66835def25ef3c 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4884,7 +4884,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
     // No padding if loop is already aligned
     if ((offset & (alignmentBoundary - 1)) == 0)
     {
-        JITDUMP("\t\t;; Skip alignment: 'Loop already aligned at %dB boundary.'\n", alignmentBoundary);
+        JITDUMP(";; Skip alignment: 'Loop already aligned at %dB boundary.'\n", alignmentBoundary);
         return 0;
     }
 
@@ -4908,7 +4908,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
     // No padding if loop is big
     if (loopSize > maxLoopSize)
     {
-        JITDUMP("\t\t;; Skip alignment: 'Loop is big. LoopSize= %d, MaxLoopSize= %d.'\n", alignmentBoundary, loopSize,
+        JITDUMP(";; Skip alignment: 'Loop is big. LoopSize= %d, MaxLoopSize= %d.'\n", alignmentBoundary, loopSize,
                 maxLoopSize);
         return 0;
     }
@@ -4935,13 +4935,13 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
             if (nPaddingBytes == 0)
             {
                 skipPadding = true;
-                JITDUMP("\t\t;; Skip alignment: 'Loop already aligned at 16B boundary.'\n");
+                JITDUMP(";; Skip alignment: 'Loop already aligned at 16B boundary.'\n");
             }
             // Check if the alignment exceeds new maxPadding limit
             else if (nPaddingBytes > nMaxPaddingBytes)
             {
                 skipPadding = true;
-                JITDUMP("\t\t;; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
+                JITDUMP(";; Skip alignment: 'PaddingNeeded= %d, MaxPadding= %d, LoopSize= %d, "
                         "AlignmentBoundary= %dB.'\n",
                         nPaddingBytes, nMaxPaddingBytes, loopSize, alignmentBoundary);
             }
@@ -4965,7 +4965,7 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
             else
             {
                 // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
-                JITDUMP("\t\t;; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n",
+                JITDUMP(";; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n",
                         minBlocksNeededForLoop, alignmentBoundary);
             }
         }
@@ -4994,17 +4994,12 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
         else
         {
             // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
-            JITDUMP("\t\t;; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n",
+            JITDUMP(";; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n",
                     minBlocksNeededForLoop, alignmentBoundary);
         }
     }
 
-#if DEBUG
-    if (displayAlignmentDetails && paddingToAdd > 0)
-    {
-        printf("\t\t;; Add alignment: 'Padding= %d, AlignmentBoundary= %dB.'\n", paddingToAdd, alignmentBoundary);
-    }
-#endif
+    JITDUMP(";; Calculated padding to add %d bytes to align at %dB boudnary.'\n", paddingToAdd, alignmentBoundary);
 
     // Either no padding is added because it is too expensive or the offset gets aligned
     // to the alignment boundary

From bf03842f9709996932f603c6199ff3c530574a42 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 17 Dec 2020 18:03:09 -0800
Subject: [PATCH 45/59] Review comments

---
 src/coreclr/inc/corjitflags.h                 |  1 +
 src/coreclr/jit/block.cpp                     |  2 +-
 src/coreclr/jit/block.h                       |  2 +-
 src/coreclr/jit/codegenlinear.cpp             | 26 +++-------
 src/coreclr/jit/compiler.cpp                  |  2 +-
 src/coreclr/jit/compiler.h                    |  5 +-
 src/coreclr/jit/emit.cpp                      | 37 +++++++++----
 src/coreclr/jit/emit.h                        | 12 +++--
 src/coreclr/jit/emitxarch.cpp                 | 52 +++++++++++++------
 src/coreclr/jit/jitee.h                       |  1 +
 src/coreclr/jit/optimizer.cpp                 | 23 ++++----
 .../tools/Common/JitInterface/CorInfoTypes.cs |  1 +
 12 files changed, 97 insertions(+), 67 deletions(-)

diff --git a/src/coreclr/inc/corjitflags.h b/src/coreclr/inc/corjitflags.h
index 0c6eacaee02e53..6add94e0c4b357 100644
--- a/src/coreclr/inc/corjitflags.h
+++ b/src/coreclr/inc/corjitflags.h
@@ -79,6 +79,7 @@ class CORJIT_FLAGS
         CORJIT_FLAG_BBINSTR                 = 29, // Collect basic block profile information
         CORJIT_FLAG_BBOPT                   = 30, // Optimize method based on profile information
         CORJIT_FLAG_FRAMED                  = 31, // All methods have an EBP frame
+        CORJIT_FLAG_UNUSED35                = 32,
         CORJIT_FLAG_PUBLISH_SECRET_PARAM    = 33, // JIT must place stub secret param into local 0.  (used by IL stubs)
         CORJIT_FLAG_UNUSED12                = 34,
         CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
diff --git a/src/coreclr/jit/block.cpp b/src/coreclr/jit/block.cpp
index aa5a72cbfff63b..6cea8dd2c367a2 100644
--- a/src/coreclr/jit/block.cpp
+++ b/src/coreclr/jit/block.cpp
@@ -507,7 +507,7 @@ void BasicBlock::dspFlags()
     }
     if (bbFlags & BBF_LOOP_ALIGN)
     {
-        printf("finnerloop ");
+        printf("align ");
     }
 }
 
diff --git a/src/coreclr/jit/block.h b/src/coreclr/jit/block.h
index d7f49f067fe191..d92f5b2c3550c1 100644
--- a/src/coreclr/jit/block.h
+++ b/src/coreclr/jit/block.h
@@ -448,7 +448,7 @@ struct BasicBlock : private LIR::Range
 
 #define BBF_PATCHPOINT                     MAKE_BBFLAG(36) // Block is a patchpoint
 #define BBF_HAS_CLASS_PROFILE              MAKE_BBFLAG(37) // BB contains a call needing a class profile
-#define BBF_LOOP_ALIGN                     MAKE_BBFLAG(39) // Block is lexically the fist block within the innermost loop.
+#define BBF_LOOP_ALIGN                     MAKE_BBFLAG(39) // Block is lexically the first block in a loop we intend to align.
 
 // clang-format on
 
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index ccecd692962e0e..a2796969b26015 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -349,8 +349,11 @@ void CodeGen::genCodeForBBlist()
             needLabel = true;
         }
 
-        // Make sure we did not add align instruction in the middle of IG.
-        assert(needLabel || !GetEmitter()->emitCurIG->isLoopAlign());
+        if (GetEmitter()->emitCurIG->isLoopAlign())
+        {
+            // we had better be planning on starting a new IG
+            assert(needLabel);
+        }
 
         if (needLabel)
         {
@@ -753,7 +756,7 @@ void CodeGen::genCodeForBBlist()
 
                 if (block->bbJumpDest->isLoopAlign())
                 {
-                    GetEmitter()->emitSetLoopBackEdge((insGroup*)block->bbJumpDest->bbEmitCookie);
+                    GetEmitter()->emitSetLoopBackEdge(block->bbJumpDest);
                 }
 #endif
                 break;
@@ -778,22 +781,7 @@ void CodeGen::genCodeForBBlist()
         {
             assert(ShouldAlignLoops());
 
-            if ((compiler->opts.compJitAlignLoopBoundary > 16) && (!compiler->opts.compJitAlignLoopAdaptive))
-            {
-                GetEmitter()->emitLongLoopAlign(compiler->opts.compJitAlignLoopBoundary);
-            }
-            else
-            {
-                GetEmitter()->emitLoopAlign();
-            }
-
-            // Mark this IG as need alignment so during emitter we can check the instruction count heuristics of
-            // all IGs that follows this IG and participate in a loop.
-            GetEmitter()->emitCurIG->igFlags |= IGF_LOOP_ALIGN;
-
-            JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u to align loop# %d.\n",
-                    compiler->opts.compJitAlignLoopBoundary, compiler->compMethodID, GetEmitter()->emitCurIG->igNum,
-                    block->bbNext->bbNatLoopNum);
+            GetEmitter()->emitLoopAlignment();
         }
 #endif
 
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 18664ef3fa5aa7..1d13bd85488035 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -3932,7 +3932,7 @@ void Compiler::compSetOptimizationLevel()
             codeGen->setFrameRequired(true);
 #endif
 
-        if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_RELOC))
+        if (opts.jitFlags->IsSet(JitFlags::JIT_FLAG_PREJIT))
         {
             // The JIT doesn't currently support loop alignment for prejitted images.
             // (The JIT doesn't know the final address of the code, hence
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 5329934855a9bc..b148d4cf054d52 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9049,9 +9049,8 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 #define DEFAULT_ALIGN_LOOP_BOUNDARY 0x20
 
 // For non-adaptive loop alignment, by default, only align a loop whose size is
-// atmost 3 times of 32B chunk. If the loop is bigger than that, it is most
-// likely the loop code is complicated enough and aligning such loop will not help
-// much.
+// at most 3 times the alignment block size. If the loop is bigger than that, it is most
+// likely complicated enough that loop alignment will not impact performance.
 #define DEFAULT_MAX_LOOPSIZE_FOR_ALIGN DEFAULT_ALIGN_LOOP_BOUNDARY * 3
 
 #ifdef DEBUG
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 66835def25ef3c..42d1001acc85d0 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -882,6 +882,7 @@ insGroup* emitter::emitSavIG(bool emitAdd)
         }
 
         emitAlignLast = last;
+        assert(emitAlignLast->idaIG->igNum == emitLastAlignedIgNum);
     }
 
 #endif
@@ -4612,6 +4613,25 @@ void emitter::emitJumpDistBind()
 
 #ifdef FEATURE_LOOP_ALIGN
 
+void emitter::emitLoopAlignment()
+{
+    if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive))
+    {
+        emitLongLoopAlign(emitComp->opts.compJitAlignLoopBoundary);
+    }
+    else
+    {
+        emitLoopAlign();
+    }
+
+    // Mark this IG as need alignment so during emitter we can check the instruction count heuristics of
+    // all IGs that follows this IG and participate in a loop.
+    emitCurIG->igFlags |= IGF_LOOP_ALIGN;
+
+    JITDUMP("Adding 'align' instruction of %d bytes in G_M%03u_IG%02u.\n", emitComp->opts.compJitAlignLoopBoundary,
+            emitComp->compMethodID, emitCurIG->igNum);
+}
+
 //-----------------------------------------------------------------------------
 //  For loopHeaderIg, find the size of the smallest possible loop that doesn't exceed maxLoopSize.
 //
@@ -4659,8 +4679,10 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize)
 //    If the current loop covers a loop that is already marked as align, then remove
 //    the alignment flag present on IG before dstIG.
 //
-void emitter::emitSetLoopBackEdge(insGroup* dstIG)
+void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
 {
+    insGroup* dstIG = (insGroup*)loopTopBlock->bbJumpDest->bbEmitCookie;
+
     // With (dstIG != nullptr), ensure that only back edges are tracked.
     // If there is forward jump, dstIG is not yet generated.
     //
@@ -4829,12 +4851,6 @@ void emitter::emitLoopAlignAdjustments()
             JITDUMP("Adjusted alignment of G_M%03u_IG%02u from %02d to %02d\n", emitComp->compMethodID, alignIG->igNum,
                     estimatedPaddingNeeded, actualPaddingNeeded);
         }
-
-        if (actualPaddingNeeded > 0)
-        {
-            /* Record the last IG that will have non-zero align instruction */
-            emitLastAlignedIgNum = alignIG->igNum;
-        }
     }
 
     // Do adjustments of remaining IGs
@@ -4994,12 +5010,13 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig,
         else
         {
             // Otherwise, the loop just fits in minBlocksNeededForLoop and so can skip alignment.
-            JITDUMP(";; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n",
-                    minBlocksNeededForLoop, alignmentBoundary);
+            JITDUMP(";; Skip alignment: 'Loop is aligned to fit in %d blocks of %d chunks.'\n", minBlocksNeededForLoop,
+                    alignmentBoundary);
         }
     }
 
-    JITDUMP(";; Calculated padding to add %d bytes to align at %dB boudnary.'\n", paddingToAdd, alignmentBoundary);
+    JITDUMP(";; Calculated padding to add %d bytes to align at %dB boundary that starts at 0x%x.'\n", paddingToAdd,
+            alignmentBoundary, offset);
 
     // Either no padding is added because it is too expensive or the offset gets aligned
     // to the alignment boundary
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index ea136e528c1294..4f8fd00b931646 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -250,7 +250,7 @@ struct insGroup
     unsigned int   igFuncIdx; // Which function/funclet does this belong to? (Index into Compiler::compFuncInfos array.)
     unsigned short igFlags;   // see IGF_xxx below
     unsigned short igSize;    // # of bytes of code in this group
-    insGroup*      igLoopBackEdge; // Back-edge that points to the loop head.
+    insGroup*      igLoopBackEdge; // "first" back-edge that branches back to an aligned loop head.
 
 #define IGF_GC_VARS 0x0001    // new set of live GC ref variables
 #define IGF_BYREF_REGS 0x0002 // new set of live by-ref registers
@@ -265,8 +265,8 @@ struct insGroup
 #define IGF_PLACEHOLDER 0x0100    // this is a placeholder group, to be filled in later
 #define IGF_EXTEND 0x0200         // this block is conceptually an extension of the previous block
                                   // and the emitter should continue to track GC info as if there was no new block.
-#define IGF_LOOP_ALIGN 0x0400     // this group contains alignment instruction at the end because the next IG points
-                                  // to inner loop that needs alignment.
+#define IGF_LOOP_ALIGN 0x0400     // this group contains alignment instruction(s) at the end; the next IG is the
+                                  // head of a loop that needs alignment.
 
 // Mask of IGF_* flags that should be propagated to new blocks when they are created.
 // This allows prologs and epilogs to be any number of IGs, but still be
@@ -569,6 +569,7 @@ class emitter
 #if defined(TARGET_XARCH)
         static_assert_no_msg(INS_count <= 1024);
         instruction _idIns : 10;
+#define MAX_ENCODED_SIZE 15
 #elif defined(TARGET_ARM64)
         static_assert_no_msg(INS_count <= 512);
         instruction _idIns : 9;
@@ -1762,7 +1763,8 @@ class emitter
     instrDescAlign* emitAlignList;                                      // list of local align instructions in method
     instrDescAlign* emitAlignLast;                                      // last align instruction in method
     unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize); // Get the smallest loop size
-    void emitSetLoopBackEdge(insGroup* dstIG);
+    void emitLoopAlignment();
+    void emitSetLoopBackEdge(BasicBlock* loopTopBlock);
     void     emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
     unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool displayAlignmentDetails));
 #endif
@@ -1932,7 +1934,7 @@ class emitter
     instrDescJmp* emitAllocInstrJmp()
     {
 #if EMITTER_STATS
-        emitTotalIDescJmpCnt++;
+        emitTotalDescAlignCnt++;
 #endif // EMITTER_STATS
         return (instrDescJmp*)emitAllocAnyInstr(sizeof(instrDescJmp), EA_1BYTE);
     }
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index aefdb0bc82b097..afd73fd55b59a8 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2661,31 +2661,35 @@ emitter::instrDesc* emitter::emitNewInstrAmdCns(emitAttr size, ssize_t dsp, int
 //-----------------------------------------------------------------------------
 //
 //  The next instruction will be a loop head entry point
-//  So insert a dummy instruction here to ensure that
-//  the x86 I-cache alignment rule is followed.
+//  So insert an alignment instruction here to ensure that
+//  we can properly align the code.
 //
 void emitter::emitLoopAlign(unsigned short paddingBytes)
 {
     /* Insert a pseudo-instruction to ensure that we align
        the next instruction properly */
 
-    paddingBytes       = min(paddingBytes, 15); // We may need to skip up to 15 bytes of code
+    assert(paddingBytes <= MAX_ENCODED_SIZE);
+    paddingBytes       = min(paddingBytes, MAX_ENCODED_SIZE); // We may need to skip up to 15 bytes of code
     instrDescAlign* id = emitNewInstrAlign();
     id->idCodeSize(paddingBytes);
     emitCurIGsize += paddingBytes;
 
     id->idaIG = emitCurIG;
 
-    /* Append this instruction to this IG's jump list */
+    /* Append this instruction to this IG's alignment list */
     id->idaNext        = emitCurIGAlignList;
     emitCurIGAlignList = id;
+
+    /* Record the last IG that has align instruction */
+    emitLastAlignedIgNum = emitCurIG->igNum;
 }
 
 //-----------------------------------------------------------------------------
 //
 //  The next instruction will be a loop head entry point
-//  So insert a dummy instruction here to ensure that
-//  the x86 I-cache alignment rule is followed.
+//  So insert alignment instruction(s) here to ensure that
+//  we can properly align the code.
 //
 //  This emits more than one `INS_align` instruction depending on the
 //  alignmentBoundary parameter.
@@ -2693,10 +2697,10 @@ void emitter::emitLoopAlign(unsigned short paddingBytes)
 void emitter::emitLongLoopAlign(unsigned short alignmentBoundary)
 {
     unsigned short nPaddingBytes    = alignmentBoundary - 1;
-    unsigned short nAlignInstr      = (nPaddingBytes + (15 - 1)) / 15;
+    unsigned short nAlignInstr      = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE;
     unsigned short instrDescSize    = nAlignInstr * sizeof(instrDescAlign);
-    unsigned short insAlignCount    = nPaddingBytes / 15;
-    unsigned short lastInsAlignSize = nPaddingBytes % 15;
+    unsigned short insAlignCount    = nPaddingBytes / MAX_ENCODED_SIZE;
+    unsigned short lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE;
 
     // Ensure that all align instructions fall in same IG.
     if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp)
@@ -2722,7 +2726,7 @@ void emitter::emitLongLoopAlign(unsigned short alignmentBoundary)
 
 void emitter::emitIns_Nop(unsigned size)
 {
-    assert(size <= 15);
+    assert(size <= MAX_ENCODED_SIZE);
 
     instrDesc* id = emitNewInstr();
     id->idIns(INS_nop);
@@ -9390,7 +9394,7 @@ static BYTE* emitOutputNOP(BYTE* dst, size_t nBytes)
 //
 // Notes:
 //   Amount of padding needed to align the loop is already calculated. This
-//   method extracts that information and insert those many NOP.
+//   method extracts that information and inserts suitable NOP instructions.
 //
 BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst)
 {
@@ -9411,8 +9415,10 @@ BYTE* emitter::emitOutputAlign(insGroup* ig, instrDesc* id, BYTE* dst)
     unsigned paddingNeeded           = emitCalculatePaddingForLoopAlignment(ig, (size_t)dst, displayAlignmentDetails);
 
     // For non-adaptive, padding size is spread in multiple instructions, so don't bother checking
-    // unless non-adaptive approach is ON by default
-    assert((paddingToAdd == paddingNeeded) || !emitComp->opts.compJitAlignLoopAdaptive);
+    if (emitComp->opts.compJitAlignLoopAdaptive)
+    {
+        assert(paddingToAdd == paddingNeeded);
+    }
 #endif
 
     return emitOutputNOP(dst, paddingToAdd);
@@ -12491,7 +12497,8 @@ BYTE* emitter::emitOutputLJ(insGroup* ig, BYTE* dst, instrDesc* i)
 #ifdef DEBUG
             if (emitComp->verbose)
             {
-                printf("; NOTE: size of jump [%08X] mis-predicted by %d bytes\n", emitComp->dspPtr(id), (id->idCodeSize() - JMP_SIZE_SMALL));
+                printf("; NOTE: size of jump [%08X] mis-predicted by %d bytes\n", emitComp->dspPtr(id),
+                       (id->idCodeSize() - JMP_SIZE_SMALL));
             }
 #endif
         }
@@ -12699,6 +12706,14 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 {
                     dst = emitOutputAlign(ig, id, dst);
                 }
+#ifdef DEBUG
+                else
+                {
+                    // If the IG is not marked as needing alignment, then the code size
+                    // should be zero, i.e. no padding is needed.
+                    assert(id->idCodeSize() == 0);
+                }
+#endif
                 break;
             }
 
@@ -13740,7 +13755,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
     }
 
     // Make sure we set the instruction descriptor size correctly
-    assert((sz == emitSizeOfInsDsc(id)) || (ins == INS_align));
+    assert(sz == emitSizeOfInsDsc(id));
+// assert((sz == emitSizeOfInsDsc(id)) || (ins == INS_align));
 
 #if !FEATURE_FIXED_OUT_ARGS
     bool updateStackLevel = !emitIGisInProlog(ig) && !emitIGisInEpilog(ig);
@@ -13816,6 +13832,12 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
             assert(id->idIns() != INS_align);
             JITDUMP("Added over-estimation compensation: %d\n", diff);
 
+            if (emitComp->opts.disAsm)
+            {
+                emitDispInsAddr(dst);
+                printf("\t\t  ;; NOP compensation instructions of %d bytes.\n", diff);
+            }
+
             dst = emitOutputNOP(dst, diff);
 
             // since we compensated the over-estimation, revert the offsAdj that
diff --git a/src/coreclr/jit/jitee.h b/src/coreclr/jit/jitee.h
index 964fd0f0c70898..ad6d88a5155c4f 100644
--- a/src/coreclr/jit/jitee.h
+++ b/src/coreclr/jit/jitee.h
@@ -63,6 +63,7 @@ class JitFlags
         JIT_FLAG_BBINSTR                 = 29, // Collect basic block profile information
         JIT_FLAG_BBOPT                   = 30, // Optimize method based on profile information
         JIT_FLAG_FRAMED                  = 31, // All methods have an EBP frame
+        JIT_FLAG_UNUSED35                = 32,
         JIT_FLAG_PUBLISH_SECRET_PARAM    = 33, // JIT must place stub secret param into local 0.  (used by IL stubs)
         JIT_FLAG_UNUSED12                = 34,
         JIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 9345be1929a1d3..fe5e1180766783 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2595,7 +2595,7 @@ void Compiler::optIdentifyLoopsForAlignment()
 
             // An innerloop candidate that might need alignment
             if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) &&
-                opts.compJitAlignLoopMinBlockWeight <= first->getBBWeight(this))
+                first->getBBWeight(this) >= opts.compJitAlignLoopMinBlockWeight)
             {
                 first->bbFlags |= BBF_LOOP_ALIGN;
                 JITDUMP("L%02u that starts at " FMT_BB " needs alignment.\n", loopInd, first->bbNum);
@@ -8007,26 +8007,25 @@ bool Compiler::optComputeLoopSideEffectsOfBlock(BasicBlock* blk)
 // Marks the containsCall information to "lnum" and any parent loops.
 void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
 {
-    unsigned nestedLoopNum = lnum;
-    assert(0 <= lnum && lnum < optLoopCount);
-    while (lnum != BasicBlock::NOT_IN_LOOP)
-    {
-        optLoopTable[lnum].lpContainsCall = true;
-        lnum                              = optLoopTable[lnum].lpParent;
-    }
 
 #ifdef FEATURE_LOOP_ALIGN
     // If this is the inner most loop, reset the LOOP_ALIGN flag
     // because a loop having call will not likely to benefit from
     // alignment
-    if (optLoopTable[nestedLoopNum].lpChild == BasicBlock::NOT_IN_LOOP)
+    if (optLoopTable[lnum].lpChild == BasicBlock::NOT_IN_LOOP)
     {
-        BasicBlock* first = optLoopTable[nestedLoopNum].lpFirst;
+        BasicBlock* first = optLoopTable[lnum].lpFirst;
         first->bbFlags &= ~BBF_LOOP_ALIGN;
-        JITDUMP("Skip alignment for L%02u that starts at " FMT_BB " because loop has a call.\n", nestedLoopNum,
-                first->bbNum);
+        JITDUMP("Skip alignment for L%02u that starts at " FMT_BB " because loop has a call.\n", lnum, first->bbNum);
     }
 #endif
+
+    assert(0 <= lnum && lnum < optLoopCount);
+    while (lnum != BasicBlock::NOT_IN_LOOP)
+    {
+        optLoopTable[lnum].lpContainsCall = true;
+        lnum                              = optLoopTable[lnum].lpParent;
+    }
 }
 
 // Adds the variable liveness information for 'blk' to 'this' LoopDsc
diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
index 7b3a08b5e47dba..49b74e401cc430 100644
--- a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
+++ b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
@@ -1307,6 +1307,7 @@ public enum CorJitFlag : uint
         CORJIT_FLAG_BBINSTR = 29, // Collect basic block profile information
         CORJIT_FLAG_BBOPT = 30, // Optimize method based on profile information
         CORJIT_FLAG_FRAMED = 31, // All methods have an EBP frame
+        CORJIT_FLAG_UNUSED10 = 32,
         CORJIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0.  (used by IL stubs)
         CORJIT_FLAG_UNUSED8 = 34,
         CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background

From dae9749c18aec0ff9f7cd8da019dcab4ed5350d8 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Thu, 17 Dec 2020 19:17:20 -0800
Subject: [PATCH 46/59] missing check

---
 src/coreclr/jit/emit.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 42d1001acc85d0..486de0742927f1 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4681,7 +4681,7 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize)
 //
 void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
 {
-    insGroup* dstIG = (insGroup*)loopTopBlock->bbJumpDest->bbEmitCookie;
+    insGroup* dstIG = (insGroup*)loopTopBlock->bbEmitCookie;
 
     // With (dstIG != nullptr), ensure that only back edges are tracked.
     // If there is forward jump, dstIG is not yet generated.

From a153742f29b86f78342acbd8a4033e2a26efdec3 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 18 Dec 2020 09:52:05 -0800
Subject: [PATCH 47/59] Mark the last align IG the one that has non-zero
 padding

---
 src/coreclr/jit/emit.cpp      | 8 +++++++-
 src/coreclr/jit/emitxarch.cpp | 3 ---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 486de0742927f1..8155a099cbb79e 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -882,7 +882,6 @@ insGroup* emitter::emitSavIG(bool emitAdd)
         }
 
         emitAlignLast = last;
-        assert(emitAlignLast->idaIG->igNum == emitLastAlignedIgNum);
     }
 
 #endif
@@ -4851,6 +4850,13 @@ void emitter::emitLoopAlignAdjustments()
             JITDUMP("Adjusted alignment of G_M%03u_IG%02u from %02d to %02d\n", emitComp->compMethodID, alignIG->igNum,
                     estimatedPaddingNeeded, actualPaddingNeeded);
         }
+
+        if (actualPaddingNeeded > 0)
+        {
+            // Record the last IG that has align instruction. No overestimation
+            // adjustment will be done after emitLastAlignedIgNum.
+            emitLastAlignedIgNum = alignIG->igNum;
+        }
     }
 
     // Do adjustments of remaining IGs
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index afd73fd55b59a8..197af9425e533c 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -2680,9 +2680,6 @@ void emitter::emitLoopAlign(unsigned short paddingBytes)
     /* Append this instruction to this IG's alignment list */
     id->idaNext        = emitCurIGAlignList;
     emitCurIGAlignList = id;
-
-    /* Record the last IG that has align instruction */
-    emitLastAlignedIgNum = emitCurIG->igNum;
 }
 
 //-----------------------------------------------------------------------------

From ef02fbb360e8754327c1cc7831d5a50d8eaef4d6 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 18 Dec 2020 18:01:09 -0800
Subject: [PATCH 48/59] More review comments

---
 src/coreclr/jit/compiler.cpp  | 10 ++--
 src/coreclr/jit/compiler.h    |  3 ++
 src/coreclr/jit/emit.cpp      | 86 +++++++++++++++--------------------
 src/coreclr/jit/emitxarch.cpp |  1 -
 4 files changed, 46 insertions(+), 54 deletions(-)

diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
index 1d13bd85488035..7f53629f25496f 100644
--- a/src/coreclr/jit/compiler.cpp
+++ b/src/coreclr/jit/compiler.cpp
@@ -2624,14 +2624,18 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.compJitAlignLoopMaxCodeSize = (unsigned short)JitConfig.JitAlignLoopMaxCodeSize();
 #else
     opts.compJitAlignLoopAdaptive       = true;
+    opts.compJitAlignLoopBoundary       = DEFAULT_ALIGN_LOOP_BOUNDARY;
     opts.compJitAlignLoopMinBlockWeight = DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT;
 #endif
-
-    // Adaptive alignment works on 32B boundary
     if (opts.compJitAlignLoopAdaptive)
     {
-        opts.compJitAlignLoopBoundary = DEFAULT_ALIGN_LOOP_BOUNDARY;
+        opts.compJitAlignPaddingLimit = (opts.compJitAlignLoopBoundary >> 1) - 1;
     }
+    else
+    {
+        opts.compJitAlignPaddingLimit = opts.compJitAlignLoopBoundary - 1;
+    }
+
     assert(isPow2(opts.compJitAlignLoopBoundary));
 
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index b148d4cf054d52..b6aafde066a69b 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9069,6 +9069,9 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         // be done. By default, 32B.
         unsigned short compJitAlignLoopBoundary;
 
+        // Padding limit to align a loop.
+        unsigned short compJitAlignPaddingLimit;
+
         // If set, perform adaptive loop alignment that limits number of padding based on loop size.
         bool compJitAlignLoopAdaptive;
 
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 8155a099cbb79e..e88994f18fe89f 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4612,6 +4612,11 @@ void emitter::emitJumpDistBind()
 
 #ifdef FEATURE_LOOP_ALIGN
 
+//-----------------------------------------------------------------------------
+// emitLoopAlignment: Insert an align instruction at the end of emitCurIG and
+//                    mark it as IGF_LOOP_ALIGN to indicate that next IG  is a
+//                    loop needing alignment.
+// 
 void emitter::emitLoopAlignment()
 {
     if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive))
@@ -4632,7 +4637,15 @@ void emitter::emitLoopAlignment()
 }
 
 //-----------------------------------------------------------------------------
-//  For loopHeaderIg, find the size of the smallest possible loop that doesn't exceed maxLoopSize.
+//  getLoopSize: Starting from igLoopHeader, find the size of the smallest possible loop
+//               such that it doesn't exceed the maxLoopSize.
+//
+//  Arguments:
+//       igLoopHeader - The header IG of a loop
+//       maxLoopSize  - Maximum loop size. If the loop is bigger than this value, we will just
+//                      return this value.
+//
+//  Returns:  size of a loop in bytes.
 //
 unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize)
 {
@@ -4649,17 +4662,7 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize)
 
             // In such cases, the current loop size should exclude the align instruction size reserved for
             // next loop.
-            unsigned maxPaddingAllowed;
-            if (emitComp->opts.compJitAlignLoopAdaptive)
-            {
-                maxPaddingAllowed = (emitComp->opts.compJitAlignLoopBoundary >> 1) - 1;
-            }
-            else
-            {
-                maxPaddingAllowed = emitComp->opts.compJitAlignLoopBoundary - 1;
-            }
-
-            loopSize -= maxPaddingAllowed;
+            loopSize -= emitComp->opts.compJitAlignPaddingLimit;
         }
         if ((igInLoop->igLoopBackEdge == igLoopHeader) || (loopSize > maxLoopSize))
         {
@@ -4675,7 +4678,7 @@ unsigned emitter::getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize)
 //                       if currIG has back-edge to dstIG.
 //
 // Notes:
-//    If the current loop covers a loop that is already marked as align, then remove
+//    If the current loop encloses a loop that is already marked as align, then remove
 //    the alignment flag present on IG before dstIG.
 //
 void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
@@ -4724,7 +4727,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
                 alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN;
             }
 
-            JITDUMP("** Skip alignment for loop IG%02u ~ IG%02u, because it covers an aligned loop IG%02u ~ IG%02u.\n",
+            JITDUMP("** Skip alignment for loop IG%02u ~ IG%02u, because it encloses an aligned loop IG%02u ~ IG%02u.\n",
                     currLoopStart, currLoopEnd, emitLastInnerLoopStartIgNum, emitLastInnerLoopEndIgNum);
         }
     }
@@ -4749,21 +4752,13 @@ void emitter::emitLoopAlignAdjustments()
 
     JITDUMP("*************** In emitLoopAlignAdjustments()\n");
 
-    unsigned short estimatedPaddingNeeded, alignmentBoundary = emitComp->opts.compJitAlignLoopBoundary;
-    unsigned       maxLoopSize = 0;
+    unsigned short estimatedPaddingNeeded = emitComp->opts.compJitAlignPaddingLimit;
+    unsigned short alignmentBoundary      = emitComp->opts.compJitAlignLoopBoundary;
 
     if (emitComp->opts.compJitAlignLoopAdaptive)
     {
         // For adaptive, adjust the loop size depending on the alignment boundary
         int maxBlocksAllowedForLoop = genLog2((unsigned)alignmentBoundary) - 1;
-        maxLoopSize                 = alignmentBoundary * maxBlocksAllowedForLoop;
-        estimatedPaddingNeeded      = (alignmentBoundary >> 1) - 1;
-    }
-    else
-    {
-        // For non-adaptive, just take whatever is supplied using COMPlus_ variables
-        maxLoopSize            = emitComp->opts.compJitAlignLoopMaxCodeSize;
-        estimatedPaddingNeeded = alignmentBoundary - 1;
     }
 
     unsigned        alignBytesRemoved = 0;
@@ -4771,29 +4766,19 @@ void emitter::emitLoopAlignAdjustments()
     unsigned        loopIGOffset      = 0;
     instrDescAlign* alignInstr        = emitAlignList;
 
-    // track the IG that was adjusted so we can update the offsets
-    insGroup* lastIGAdj = emitAlignList->idaIG;
-
     for (; alignInstr != nullptr; alignInstr = alignInstr->idaNext)
     {
         assert(alignInstr->idIns() == INS_align);
 
         insGroup* alignIG = alignInstr->idaIG;
 
-        // Adjust offsets of all IGs until the current IG
-        while (lastIGAdj->igNum <= alignIG->igNum)
-        {
-            lastIGAdj->igOffs -= alignBytesRemoved;
-            lastIGAdj = lastIGAdj->igNext;
-        }
-
         loopIGOffset = alignIG->igOffs + alignIG->igSize;
 
         // igSize also includes INS_align instruction, take it off.
         loopIGOffset -= estimatedPaddingNeeded;
 
         // IG can be marked as not needing alignment if during setting igLoopBackEdge, it is detected
-        // that the igLoopBackEdge covers an IG that is marked for alignment.
+        // that the igLoopBackEdge encloses an IG that is marked for alignment.
         unsigned actualPaddingNeeded =
             alignIG->isLoopAlign() ? emitCalculatePaddingForLoopAlignment(alignIG, loopIGOffset DEBUG_ARG(false)) : 0;
 
@@ -4816,7 +4801,7 @@ void emitter::emitLoopAlignAdjustments()
 
             if (emitComp->opts.compJitAlignLoopAdaptive)
             {
-                assert(actualPaddingNeeded < 15);
+                assert(actualPaddingNeeded < MAX_ENCODED_SIZE);
                 alignInstr->idCodeSize(actualPaddingNeeded);
             }
             else
@@ -4825,14 +4810,14 @@ void emitter::emitLoopAlignAdjustments()
 
 #ifdef DEBUG
 
-                int instrAdjusted = (alignmentBoundary + 14) / 15;
+                int instrAdjusted = (alignmentBoundary + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE;
 #endif
                 // Adjust the padding amount in all align instructions in this IG
                 instrDescAlign *alignInstrToAdj = alignInstr, *prevAlignInstr = nullptr;
                 for (; alignInstrToAdj != nullptr && alignInstrToAdj->idaIG == alignInstr->idaIG;
                      alignInstrToAdj = alignInstrToAdj->idaNext)
                 {
-                    unsigned newPadding = min(paddingToAdj, 15);
+                    unsigned newPadding = min(paddingToAdj, MAX_ENCODED_SIZE);
                     alignInstrToAdj->idCodeSize(newPadding);
                     paddingToAdj -= newPadding;
                     prevAlignInstr = alignInstrToAdj;
@@ -4851,6 +4836,16 @@ void emitter::emitLoopAlignAdjustments()
                     estimatedPaddingNeeded, actualPaddingNeeded);
         }
 
+        // Adjust the offset of all IGs starting from next IG until we reach the IG having the next
+        // align instruction or the end of IG list.
+        insGroup* adjOffIG      = alignIG->igNext;
+        insGroup* adjOffUptoIG  = alignInstr->idaNext != nullptr ? alignInstr->idaNext->idaIG : emitIGlast;
+        while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum))
+        {
+            adjOffIG->igOffs -= alignBytesRemoved;
+            adjOffIG = adjOffIG->igNext;
+        }
+
         if (actualPaddingNeeded > 0)
         {
             // Record the last IG that has align instruction. No overestimation
@@ -4859,13 +4854,6 @@ void emitter::emitLoopAlignAdjustments()
         }
     }
 
-    // Do adjustments of remaining IGs
-    while (lastIGAdj != nullptr)
-    {
-        lastIGAdj->igOffs -= alignBytesRemoved;
-        lastIGAdj = lastIGAdj->igNext;
-    }
-
 #ifdef DEBUG
     emitCheckIGoffsets();
 #endif
@@ -5935,11 +5923,6 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
     {
         printf("\n");
     }
-
-    if (emitComp->verbose)
-    {
-        printf("Allocated method code size = %4u , actual size = %4u\n", emitTotalCodeSize, cp - codeBlock);
-    }
 #endif
 
     unsigned actualCodeSize = emitCurCodeOffs(cp);
@@ -5954,6 +5937,9 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
     // If you add this padding during the emitIGlist loop, then it will
     // emit offsets after the loop with wrong value (for example for GC ref variables).
     unsigned unusedSize = emitTotalCodeSize - actualCodeSize;
+
+    JITDUMP("Allocated method code size = %4u , actual size = %4u, unused size = %4u\n", emitTotalCodeSize, actualCodeSize, unusedSize);
+
     for (unsigned i = 0; i < unusedSize; ++i)
     {
         *cp++ = DEFAULT_CODE_BUFFER_INIT;
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 197af9425e533c..5c7941b1515886 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -13753,7 +13753,6 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
 
     // Make sure we set the instruction descriptor size correctly
     assert(sz == emitSizeOfInsDsc(id));
-// assert((sz == emitSizeOfInsDsc(id)) || (ins == INS_align));
 
 #if !FEATURE_FIXED_OUT_ARGS
     bool updateStackLevel = !emitIGisInProlog(ig) && !emitIGisInEpilog(ig);

From e7e0d68d0397caeb2b983a7cdcfaf897b78c3af4 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 18 Dec 2020 19:45:15 -0800
Subject: [PATCH 49/59] Propagate BBF_LOOP_ALIGN for compacting blocks

---
 src/coreclr/jit/flowgraph.cpp | 9 +++++++++
 src/coreclr/jit/optimizer.cpp | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index 8ce88cb090ecb1..7f903c9db68870 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -10946,6 +10946,15 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext)
             break;
     }
 
+    // Add the LOOP_ALIGN flag
+    if (bNext->isLoopAlign())
+    {
+        // Only if the new block is a jump target or has a label
+        if (((block->bbFlags & BBF_JMP_TARGET) != 0) || ((block->bbFlags & BBF_HAS_LABEL) != 0))
+        {
+            block->bbFlags |= BBF_LOOP_ALIGN;
+        }
+    }
     // If we're collapsing a block created after the dominators are
     // computed, copy block number the block and reuse dominator
     // information from bNext to block.
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index fe5e1180766783..10c70c56184eef 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2598,7 +2598,7 @@ void Compiler::optIdentifyLoopsForAlignment()
                 first->getBBWeight(this) >= opts.compJitAlignLoopMinBlockWeight)
             {
                 first->bbFlags |= BBF_LOOP_ALIGN;
-                JITDUMP("L%02u that starts at " FMT_BB " needs alignment.\n", loopInd, first->bbNum);
+                JITDUMP("L%02u that starts at " FMT_BB " needs alignment, weight=%f.\n", loopInd, first->bbNum, first->getBBWeight(this));
             }
         }
     }

From 4b0e64de1d7c5dd390643e8a8be6cdca8459dd03 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Sun, 20 Dec 2020 01:23:09 -0800
Subject: [PATCH 50/59] Handle ALIGN_LOOP flag for loops that are unrolled

---
 src/coreclr/jit/flowgraph.cpp |  1 +
 src/coreclr/jit/optimizer.cpp | 16 ++++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index 7f903c9db68870..774e973a345cef 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -10953,6 +10953,7 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext)
         if (((block->bbFlags & BBF_JMP_TARGET) != 0) || ((block->bbFlags & BBF_HAS_LABEL) != 0))
         {
             block->bbFlags |= BBF_LOOP_ALIGN;
+            JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " during compacting.\n", bNext->bbNum, block->bbNum);
         }
     }
     // If we're collapsing a block created after the dominators are
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 10c70c56184eef..def227bcd0f205 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -3784,6 +3784,22 @@ void Compiler::optUnrollLoops()
 #endif
         }
 
+#ifdef FEATURE_LOOP_ALIGN
+        for (block = head->bbNext;; block = block->bbNext)
+        {
+            if (block->isLoopAlign())
+            {
+                block->bbFlags &= ~BBF_LOOP_ALIGN;
+                JITDUMP("Removing align flag from unrolled loop in " FMT_BB "\n", block->bbNum);
+            }
+
+            if (block == bottom)
+            {
+                break;
+            }
+        }
+#endif
+
         /* Create the unrolled loop statement list */
         {
             BlockToBlockMap blockMap(getAllocator());

From 6fddce8f4e793196b511367d907b610b5c0b5bea Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Sun, 20 Dec 2020 01:25:34 -0800
Subject: [PATCH 51/59] jit format

---
 src/coreclr/jit/emit.cpp      | 14 ++++++++------
 src/coreclr/jit/flowgraph.cpp |  3 ++-
 src/coreclr/jit/optimizer.cpp |  3 ++-
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index e88994f18fe89f..b6c594673510b5 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4616,7 +4616,7 @@ void emitter::emitJumpDistBind()
 // emitLoopAlignment: Insert an align instruction at the end of emitCurIG and
 //                    mark it as IGF_LOOP_ALIGN to indicate that next IG  is a
 //                    loop needing alignment.
-// 
+//
 void emitter::emitLoopAlignment()
 {
     if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive))
@@ -4727,8 +4727,9 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
                 alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN;
             }
 
-            JITDUMP("** Skip alignment for loop IG%02u ~ IG%02u, because it encloses an aligned loop IG%02u ~ IG%02u.\n",
-                    currLoopStart, currLoopEnd, emitLastInnerLoopStartIgNum, emitLastInnerLoopEndIgNum);
+            JITDUMP(
+                "** Skip alignment for loop IG%02u ~ IG%02u, because it encloses an aligned loop IG%02u ~ IG%02u.\n",
+                currLoopStart, currLoopEnd, emitLastInnerLoopStartIgNum, emitLastInnerLoopEndIgNum);
         }
     }
 }
@@ -4838,8 +4839,8 @@ void emitter::emitLoopAlignAdjustments()
 
         // Adjust the offset of all IGs starting from next IG until we reach the IG having the next
         // align instruction or the end of IG list.
-        insGroup* adjOffIG      = alignIG->igNext;
-        insGroup* adjOffUptoIG  = alignInstr->idaNext != nullptr ? alignInstr->idaNext->idaIG : emitIGlast;
+        insGroup* adjOffIG     = alignIG->igNext;
+        insGroup* adjOffUptoIG = alignInstr->idaNext != nullptr ? alignInstr->idaNext->idaIG : emitIGlast;
         while ((adjOffIG != nullptr) && (adjOffIG->igNum <= adjOffUptoIG->igNum))
         {
             adjOffIG->igOffs -= alignBytesRemoved;
@@ -5938,7 +5939,8 @@ unsigned emitter::emitEndCodeGen(Compiler* comp,
     // emit offsets after the loop with wrong value (for example for GC ref variables).
     unsigned unusedSize = emitTotalCodeSize - actualCodeSize;
 
-    JITDUMP("Allocated method code size = %4u , actual size = %4u, unused size = %4u\n", emitTotalCodeSize, actualCodeSize, unusedSize);
+    JITDUMP("Allocated method code size = %4u , actual size = %4u, unused size = %4u\n", emitTotalCodeSize,
+            actualCodeSize, unusedSize);
 
     for (unsigned i = 0; i < unusedSize; ++i)
     {
diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index 774e973a345cef..7d34d5ec821f6f 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -10953,7 +10953,8 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext)
         if (((block->bbFlags & BBF_JMP_TARGET) != 0) || ((block->bbFlags & BBF_HAS_LABEL) != 0))
         {
             block->bbFlags |= BBF_LOOP_ALIGN;
-            JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " during compacting.\n", bNext->bbNum, block->bbNum);
+            JITDUMP("Propagating LOOP_ALIGN flag from " FMT_BB " to " FMT_BB " during compacting.\n", bNext->bbNum,
+                    block->bbNum);
         }
     }
     // If we're collapsing a block created after the dominators are
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index def227bcd0f205..41c708520e99e2 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2598,7 +2598,8 @@ void Compiler::optIdentifyLoopsForAlignment()
                 first->getBBWeight(this) >= opts.compJitAlignLoopMinBlockWeight)
             {
                 first->bbFlags |= BBF_LOOP_ALIGN;
-                JITDUMP("L%02u that starts at " FMT_BB " needs alignment, weight=%f.\n", loopInd, first->bbNum, first->getBBWeight(this));
+                JITDUMP("L%02u that starts at " FMT_BB " needs alignment, weight=%f.\n", loopInd, first->bbNum,
+                        first->getBBWeight(this));
             }
         }
     }

From bad5685b41c182902c5f34a8b4233673f9e70f51 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Mon, 21 Dec 2020 15:37:13 -0800
Subject: [PATCH 52/59] Loop size upto last back-edge instead of first
 back-edge

---
 src/coreclr/jit/emit.cpp | 2 +-
 src/coreclr/jit/emit.h   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index b6c594673510b5..146c52963b9ed4 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -4691,7 +4691,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
     // We don't rely on (block->bbJumpDest->bbNum <= block->bbNum) because the basic
     // block numbering is not guaranteed to be sequential.
 
-    if ((dstIG != nullptr) && (dstIG->igNum <= emitCurIG->igNum) && (emitCurIG->igLoopBackEdge == nullptr))
+    if ((dstIG != nullptr) && (dstIG->igNum <= emitCurIG->igNum))
     {
         unsigned currLoopStart = dstIG->igNum;
         unsigned currLoopEnd   = emitCurIG->igNum;
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 4f8fd00b931646..98f6167a29cb80 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -250,7 +250,7 @@ struct insGroup
     unsigned int   igFuncIdx; // Which function/funclet does this belong to? (Index into Compiler::compFuncInfos array.)
     unsigned short igFlags;   // see IGF_xxx below
     unsigned short igSize;    // # of bytes of code in this group
-    insGroup*      igLoopBackEdge; // "first" back-edge that branches back to an aligned loop head.
+    insGroup*      igLoopBackEdge; // "last" back-edge that branches back to an aligned loop head.
 
 #define IGF_GC_VARS 0x0001    // new set of live GC ref variables
 #define IGF_BYREF_REGS 0x0002 // new set of live by-ref registers

From 8c30a967276e921fb63e611741fdd644d2a7910a Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Mon, 21 Dec 2020 15:37:53 -0800
Subject: [PATCH 53/59] Take loop weight in consideration

---
 src/coreclr/jit/compiler.h    | 2 +-
 src/coreclr/jit/optimizer.cpp | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index b6aafde066a69b..3d5156aa474de5 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9042,7 +9042,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 // based on experimenting with various benchmarks.
 
 // Default minimum loop block weight required to enable loop alignment.
-#define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 10
+#define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 8
 
 // By default a loop will be aligned at 32B address boundary to get better
 // performance as per architecture manuals.
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 41c708520e99e2..2c9dd3e1e7d42f 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -4472,10 +4472,6 @@ void Compiler::optOptimizeLoops()
 
         optFindNaturalLoops();
 
-        // Check if any of the loops need alignment
-
-        optIdentifyLoopsForAlignment();
-
         unsigned loopNum = 0;
 
         /* Iterate over the flow graph, marking all loops */
@@ -4554,6 +4550,10 @@ void Compiler::optOptimizeLoops()
             }
         }
 
+        // Check if any of the loops need alignment
+
+        optIdentifyLoopsForAlignment();
+
 #if COUNT_LOOPS
         totalUnnatLoopCount += loopNum;
 #endif

From f32f560cd1b1f99b4c765582a5e3f3151ece7281 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Mon, 4 Jan 2021 12:35:08 -0800
Subject: [PATCH 54/59] remove align flag if loop is no longer valid

---
 src/coreclr/jit/morph.cpp     | 6 ++++++
 src/coreclr/jit/optimizer.cpp | 5 +++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index b72d32e5ce2eab..f65f0edfd022a9 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -16317,6 +16317,12 @@ bool Compiler::fgFoldConditional(BasicBlock* block)
                          * Remove the loop from the table */
 
                         optLoopTable[loopNum].lpFlags |= LPFLG_REMOVED;
+#ifdef FEATURE_LOOP_ALIGN
+                        optLoopTable[loopNum].lpFirst->bbFlags &= ~BBF_LOOP_ALIGN;
+                        JITDUMP("Removing LOOP_ALIGN flag from bogus loop in " FMT_BB "\n",
+                                optLoopTable[loopNum].lpFirst->bbNum);
+#endif
+
 #ifdef DEBUG
                         if (verbose)
                         {
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 2c9dd3e1e7d42f..8e31e0d2541c13 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -3791,7 +3791,7 @@ void Compiler::optUnrollLoops()
             if (block->isLoopAlign())
             {
                 block->bbFlags &= ~BBF_LOOP_ALIGN;
-                JITDUMP("Removing align flag from unrolled loop in " FMT_BB "\n", block->bbNum);
+                JITDUMP("Removing LOOP_ALIGN flag from unrolled loop in " FMT_BB "\n", block->bbNum);
             }
 
             if (block == bottom)
@@ -8033,7 +8033,8 @@ void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
     {
         BasicBlock* first = optLoopTable[lnum].lpFirst;
         first->bbFlags &= ~BBF_LOOP_ALIGN;
-        JITDUMP("Skip alignment for L%02u that starts at " FMT_BB " because loop has a call.\n", lnum, first->bbNum);
+        JITDUMP("Removing LOOP_ALIGN flag for L%02u that starts at " FMT_BB " because loop has a call.\n", lnum,
+                first->bbNum);
     }
 #endif
 

From 23177b0b3b20f36533264d1de04121a601ebadca Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 6 Jan 2021 14:25:02 -0800
Subject: [PATCH 55/59] Adjust loop block weight to 4 instead of 8

---
 src/coreclr/jit/compiler.h    |  2 +-
 src/coreclr/jit/optimizer.cpp | 17 ++++++++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 3d5156aa474de5..9af31fdf03a071 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -9042,7 +9042,7 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 // based on experimenting with various benchmarks.
 
 // Default minimum loop block weight required to enable loop alignment.
-#define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 8
+#define DEFAULT_ALIGN_LOOP_MIN_BLOCK_WEIGHT 4
 
 // By default a loop will be aligned at 32B address boundary to get better
 // performance as per architecture manuals.
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 8e31e0d2541c13..3f58496de28d7e 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2594,12 +2594,19 @@ void Compiler::optIdentifyLoopsForAlignment()
             BasicBlock* first = optLoopTable[loopInd].lpFirst;
 
             // An innerloop candidate that might need alignment
-            if ((optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP) &&
-                first->getBBWeight(this) >= opts.compJitAlignLoopMinBlockWeight)
+            if (optLoopTable[loopInd].lpChild == BasicBlock::NOT_IN_LOOP)
             {
-                first->bbFlags |= BBF_LOOP_ALIGN;
-                JITDUMP("L%02u that starts at " FMT_BB " needs alignment, weight=%f.\n", loopInd, first->bbNum,
-                        first->getBBWeight(this));
+                if (first->getBBWeight(this) >= (opts.compJitAlignLoopMinBlockWeight * BB_UNITY_WEIGHT))
+                {
+                    first->bbFlags |= BBF_LOOP_ALIGN;
+                    JITDUMP("L%02u that starts at " FMT_BB " needs alignment, weight=%f.\n", loopInd, first->bbNum,
+                            first->getBBWeight(this));
+                }
+                else
+                {
+                    JITDUMP("Skip alignment for L%02u that starts at " FMT_BB " weight=%f.\n", loopInd, first->bbNum,
+                            first->getBBWeight(this));
+                }
             }
         }
     }

From 9f3cb2da2503313de7bb7985f805089e2013311f Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 6 Jan 2021 15:41:55 -0800
Subject: [PATCH 56/59] missing space after rebase

---
 src/coreclr/jit/flowgraph.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index 7d34d5ec821f6f..eb9345fb6c3556 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -10957,6 +10957,7 @@ void Compiler::fgCompactBlocks(BasicBlock* block, BasicBlock* bNext)
                     block->bbNum);
         }
     }
+
     // If we're collapsing a block created after the dominators are
     // computed, copy block number the block and reuse dominator
     // information from bNext to block.

From 74620ed16354b1a718f9fa6f6f712a7d87fac11a Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Wed, 6 Jan 2021 16:23:11 -0800
Subject: [PATCH 57/59] fix the enum values after rebase

---
 src/coreclr/inc/corjitflags.h                 | 50 +++++++++----------
 src/coreclr/inc/jiteeversionguid.h            | 10 ++--
 src/coreclr/jit/flowgraph.cpp                 |  6 +--
 src/coreclr/jit/jitee.h                       | 50 +++++++++----------
 .../tools/Common/JitInterface/CorInfoTypes.cs |  6 +--
 5 files changed, 61 insertions(+), 61 deletions(-)

diff --git a/src/coreclr/inc/corjitflags.h b/src/coreclr/inc/corjitflags.h
index 6add94e0c4b357..5cea8a224c609d 100644
--- a/src/coreclr/inc/corjitflags.h
+++ b/src/coreclr/inc/corjitflags.h
@@ -79,45 +79,45 @@ class CORJIT_FLAGS
         CORJIT_FLAG_BBINSTR                 = 29, // Collect basic block profile information
         CORJIT_FLAG_BBOPT                   = 30, // Optimize method based on profile information
         CORJIT_FLAG_FRAMED                  = 31, // All methods have an EBP frame
-        CORJIT_FLAG_UNUSED35                = 32,
+        CORJIT_FLAG_UNUSED12                = 32,
         CORJIT_FLAG_PUBLISH_SECRET_PARAM    = 33, // JIT must place stub secret param into local 0.  (used by IL stubs)
-        CORJIT_FLAG_UNUSED12                = 34,
+        CORJIT_FLAG_UNUSED13                = 34,
         CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
         CORJIT_FLAG_USE_PINVOKE_HELPERS     = 36, // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions
         CORJIT_FLAG_REVERSE_PINVOKE         = 37, // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog
-        CORJIT_FLAG_UNUSED13                = 38,
+        CORJIT_FLAG_UNUSED14                = 38,
         CORJIT_FLAG_TIER0                   = 39, // This is the initial tier for tiered compilation which should generate code as quickly as possible
         CORJIT_FLAG_TIER1                   = 40, // This is the final tier (for now) for tiered compilation which should generate high quality code
 
 #if defined(TARGET_ARM)
         CORJIT_FLAG_RELATIVE_CODE_RELOCS    = 41, // JIT should generate PC-relative address computations instead of EE relocation records
 #else // !defined(TARGET_ARM)
-        CORJIT_FLAG_UNUSED14                = 41,
+        CORJIT_FLAG_UNUSED15                = 41,
 #endif // !defined(TARGET_ARM)
 
         CORJIT_FLAG_NO_INLINING             = 42, // JIT should not inline any called method into this method
 
-        CORJIT_FLAG_UNUSED15                = 43,
-        CORJIT_FLAG_UNUSED16                = 44,
-        CORJIT_FLAG_UNUSED17                = 45,
-        CORJIT_FLAG_UNUSED18                = 46,
-        CORJIT_FLAG_UNUSED19                = 47,
-        CORJIT_FLAG_UNUSED20                = 48,
-        CORJIT_FLAG_UNUSED21                = 49,
-        CORJIT_FLAG_UNUSED22                = 50,
-        CORJIT_FLAG_UNUSED23                = 51,
-        CORJIT_FLAG_UNUSED24                = 52,
-        CORJIT_FLAG_UNUSED25                = 53,
-        CORJIT_FLAG_UNUSED26                = 54,
-        CORJIT_FLAG_UNUSED27                = 55,
-        CORJIT_FLAG_UNUSED28                = 56,
-        CORJIT_FLAG_UNUSED29                = 57,
-        CORJIT_FLAG_UNUSED30                = 58,
-        CORJIT_FLAG_UNUSED31                = 59,
-        CORJIT_FLAG_UNUSED32                = 60,
-        CORJIT_FLAG_UNUSED33                = 61,
-        CORJIT_FLAG_UNUSED34                = 62,
-        CORJIT_FLAG_UNUSED35                = 63
+        CORJIT_FLAG_UNUSED16                = 43,
+        CORJIT_FLAG_UNUSED17                = 44,
+        CORJIT_FLAG_UNUSED18                = 45,
+        CORJIT_FLAG_UNUSED19                = 46,
+        CORJIT_FLAG_UNUSED20                = 47,
+        CORJIT_FLAG_UNUSED21                = 48,
+        CORJIT_FLAG_UNUSED22                = 49,
+        CORJIT_FLAG_UNUSED23                = 50,
+        CORJIT_FLAG_UNUSED24                = 51,
+        CORJIT_FLAG_UNUSED25                = 52,
+        CORJIT_FLAG_UNUSED26                = 53,
+        CORJIT_FLAG_UNUSED27                = 54,
+        CORJIT_FLAG_UNUSED28                = 55,
+        CORJIT_FLAG_UNUSED29                = 56,
+        CORJIT_FLAG_UNUSED30                = 57,
+        CORJIT_FLAG_UNUSED31                = 58,
+        CORJIT_FLAG_UNUSED32                = 59,
+        CORJIT_FLAG_UNUSED33                = 60,
+        CORJIT_FLAG_UNUSED34                = 61,
+        CORJIT_FLAG_UNUSED35                = 62,
+        CORJIT_FLAG_UNUSED36                = 63
     };
 
     CORJIT_FLAGS()
diff --git a/src/coreclr/inc/jiteeversionguid.h b/src/coreclr/inc/jiteeversionguid.h
index 6ee29b5a00fae6..e67969b5222d5a 100644
--- a/src/coreclr/inc/jiteeversionguid.h
+++ b/src/coreclr/inc/jiteeversionguid.h
@@ -31,11 +31,11 @@
 //
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
 
-constexpr GUID JITEEVersionIdentifier = { /* 8e32c24d-62fe-4d78-ae73-eedddb928ee2 */
-    0x8e32c24d,
-    0x62fe,
-    0x4d78,
-    {0xae, 0x73, 0xee, 0xdd, 0xdb, 0x92, 0x8e, 0xe2}
+constexpr GUID JITEEVersionIdentifier = { /* de81f48e-7701-45f2-a91b-1914f88dfd11 */
+    0xde81f48e,
+    0x7701,
+    0x45f2,
+    {0xa9, 0x1b, 0x19, 0x14, 0xf8, 0x8d, 0xfd, 0x11}
 };
 
 //////////////////////////////////////////////////////////////////////////////////////////////////////////
diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
index eb9345fb6c3556..b0a49030dacd4a 100644
--- a/src/coreclr/jit/flowgraph.cpp
+++ b/src/coreclr/jit/flowgraph.cpp
@@ -9642,9 +9642,9 @@ BasicBlock* Compiler::fgSplitBlockAtEnd(BasicBlock* curr)
     newBlock->bbFlags = curr->bbFlags;
 
     // Remove flags that the new block can't have.
-    newBlock->bbFlags &=
-        ~(BBF_TRY_BEG | BBF_LOOP_HEAD | BBF_LOOP_CALL0 | BBF_LOOP_CALL1 | BBF_HAS_LABEL | BBF_JMP_TARGET |
-          BBF_FUNCLET_BEG | BBF_LOOP_PREHEADER | BBF_KEEP_BBJ_ALWAYS | BBF_PATCHPOINT | BBF_BACKWARD_JUMP_TARGET | BBF_LOOP_ALIGN);
+    newBlock->bbFlags &= ~(BBF_TRY_BEG | BBF_LOOP_HEAD | BBF_LOOP_CALL0 | BBF_LOOP_CALL1 | BBF_HAS_LABEL |
+                           BBF_JMP_TARGET | BBF_FUNCLET_BEG | BBF_LOOP_PREHEADER | BBF_KEEP_BBJ_ALWAYS |
+                           BBF_PATCHPOINT | BBF_BACKWARD_JUMP_TARGET | BBF_LOOP_ALIGN);
 
     // Remove the GC safe bit on the new block. It seems clear that if we split 'curr' at the end,
     // such that all the code is left in 'curr', and 'newBlock' just gets the control flow, then
diff --git a/src/coreclr/jit/jitee.h b/src/coreclr/jit/jitee.h
index ad6d88a5155c4f..6301166e489c0f 100644
--- a/src/coreclr/jit/jitee.h
+++ b/src/coreclr/jit/jitee.h
@@ -63,45 +63,45 @@ class JitFlags
         JIT_FLAG_BBINSTR                 = 29, // Collect basic block profile information
         JIT_FLAG_BBOPT                   = 30, // Optimize method based on profile information
         JIT_FLAG_FRAMED                  = 31, // All methods have an EBP frame
-        JIT_FLAG_UNUSED35                = 32,
+        JIT_FLAG_UNUSED12                = 32,
         JIT_FLAG_PUBLISH_SECRET_PARAM    = 33, // JIT must place stub secret param into local 0.  (used by IL stubs)
-        JIT_FLAG_UNUSED12                = 34,
+        JIT_FLAG_UNUSED13                = 34,
         JIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
         JIT_FLAG_USE_PINVOKE_HELPERS     = 36, // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions
         JIT_FLAG_REVERSE_PINVOKE         = 37, // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog
-        JIT_FLAG_UNUSED13                = 38,
+        JIT_FLAG_UNUSED14                = 38,
         JIT_FLAG_TIER0                   = 39, // This is the initial tier for tiered compilation which should generate code as quickly as possible
         JIT_FLAG_TIER1                   = 40, // This is the final tier (for now) for tiered compilation which should generate high quality code
 
 #if defined(TARGET_ARM)
         JIT_FLAG_RELATIVE_CODE_RELOCS    = 41, // JIT should generate PC-relative address computations instead of EE relocation records
 #else // !defined(TARGET_ARM)
-        JIT_FLAG_UNUSED14                = 41,
+        JIT_FLAG_UNUSED15                = 41,
 #endif // !defined(TARGET_ARM)
 
         JIT_FLAG_NO_INLINING             = 42, // JIT should not inline any called method into this method
 
-        JIT_FLAG_UNUSED15                = 43,
-        JIT_FLAG_UNUSED16                = 44,
-        JIT_FLAG_UNUSED17                = 45,
-        JIT_FLAG_UNUSED18                = 46,
-        JIT_FLAG_UNUSED19                = 47,
-        JIT_FLAG_UNUSED20                = 48,
-        JIT_FLAG_UNUSED21                = 49,
-        JIT_FLAG_UNUSED22                = 50,
-        JIT_FLAG_UNUSED23                = 51,
-        JIT_FLAG_UNUSED24                = 52,
-        JIT_FLAG_UNUSED25                = 53,
-        JIT_FLAG_UNUSED26                = 54,
-        JIT_FLAG_UNUSED27                = 55,
-        JIT_FLAG_UNUSED28                = 56,
-        JIT_FLAG_UNUSED29                = 57,
-        JIT_FLAG_UNUSED30                = 58,
-        JIT_FLAG_UNUSED31                = 59,
-        JIT_FLAG_UNUSED32                = 60,
-        JIT_FLAG_UNUSED33                = 61,
-        JIT_FLAG_UNUSED34                = 62,
-        JIT_FLAG_UNUSED35                = 63
+        JIT_FLAG_UNUSED16                = 43,
+        JIT_FLAG_UNUSED17                = 44,
+        JIT_FLAG_UNUSED18                = 45,
+        JIT_FLAG_UNUSED19                = 46,
+        JIT_FLAG_UNUSED20                = 47,
+        JIT_FLAG_UNUSED21                = 48,
+        JIT_FLAG_UNUSED22                = 49,
+        JIT_FLAG_UNUSED23                = 50,
+        JIT_FLAG_UNUSED24                = 51,
+        JIT_FLAG_UNUSED25                = 52,
+        JIT_FLAG_UNUSED26                = 53,
+        JIT_FLAG_UNUSED27                = 54,
+        JIT_FLAG_UNUSED28                = 55,
+        JIT_FLAG_UNUSED29                = 56,
+        JIT_FLAG_UNUSED30                = 57,
+        JIT_FLAG_UNUSED31                = 58,
+        JIT_FLAG_UNUSED32                = 59,
+        JIT_FLAG_UNUSED33                = 60,
+        JIT_FLAG_UNUSED34                = 61,
+        JIT_FLAG_UNUSED35                = 62,
+        JIT_FLAG_UNUSED36                = 63
 
     };
     // clang-format on
diff --git a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
index 49b74e401cc430..1aadd4e2664542 100644
--- a/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
+++ b/src/coreclr/tools/Common/JitInterface/CorInfoTypes.cs
@@ -1307,13 +1307,13 @@ public enum CorJitFlag : uint
         CORJIT_FLAG_BBINSTR = 29, // Collect basic block profile information
         CORJIT_FLAG_BBOPT = 30, // Optimize method based on profile information
         CORJIT_FLAG_FRAMED = 31, // All methods have an EBP frame
-        CORJIT_FLAG_UNUSED10 = 32,
+        CORJIT_FLAG_UNUSED8 = 32,
         CORJIT_FLAG_PUBLISH_SECRET_PARAM = 33, // JIT must place stub secret param into local 0.  (used by IL stubs)
-        CORJIT_FLAG_UNUSED8 = 34,
+        CORJIT_FLAG_UNUSED9 = 34,
         CORJIT_FLAG_SAMPLING_JIT_BACKGROUND = 35, // JIT is being invoked as a result of stack sampling for hot methods in the background
         CORJIT_FLAG_USE_PINVOKE_HELPERS = 36, // The JIT should use the PINVOKE_{BEGIN,END} helpers instead of emitting inline transitions
         CORJIT_FLAG_REVERSE_PINVOKE = 37, // The JIT should insert REVERSE_PINVOKE_{ENTER,EXIT} helpers into method prolog/epilog
-        CORJIT_FLAG_UNUSED9 = 38,
+        CORJIT_FLAG_UNUSED10 = 38,
         CORJIT_FLAG_TIER0 = 39, // This is the initial tier for tiered compilation which should generate code as quickly as possible
         CORJIT_FLAG_TIER1 = 40, // This is the final tier (for now) for tiered compilation which should generate high quality code
         CORJIT_FLAG_RELATIVE_CODE_RELOCS = 41, // JIT should generate PC-relative address computations instead of EE relocation records

From b100226b09f32f0c96c13cad956cc4b5f60fec17 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Fri, 8 Jan 2021 22:25:36 -0800
Subject: [PATCH 58/59] review feedback

---
 src/coreclr/jit/codegencommon.cpp |  2 +-
 src/coreclr/jit/codegenlinear.cpp |  8 +++++---
 src/coreclr/jit/emit.cpp          | 31 ++++++++++++++++++++++---------
 src/coreclr/jit/emit.h            | 14 +++++++++-----
 src/coreclr/jit/emitxarch.cpp     |  6 ++++--
 src/coreclr/jit/jitconfigvalues.h |  2 +-
 src/coreclr/jit/morph.cpp         |  2 +-
 src/coreclr/jit/optimizer.cpp     |  6 +++---
 8 files changed, 46 insertions(+), 25 deletions(-)

diff --git a/src/coreclr/jit/codegencommon.cpp b/src/coreclr/jit/codegencommon.cpp
index a2c9f4a4b08c45..8c4572dcec43f5 100644
--- a/src/coreclr/jit/codegencommon.cpp
+++ b/src/coreclr/jit/codegencommon.cpp
@@ -2258,7 +2258,7 @@ void CodeGen::genGenerateMachineCode()
 
     GetEmitter()->emitJumpDistBind();
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     /* Perform alignment adjustments */
 
     GetEmitter()->emitLoopAlignAdjustments();
diff --git a/src/coreclr/jit/codegenlinear.cpp b/src/coreclr/jit/codegenlinear.cpp
index a2796969b26015..215e3c04f75b59 100644
--- a/src/coreclr/jit/codegenlinear.cpp
+++ b/src/coreclr/jit/codegenlinear.cpp
@@ -349,11 +349,13 @@ void CodeGen::genCodeForBBlist()
             needLabel = true;
         }
 
-        if (GetEmitter()->emitCurIG->isLoopAlign())
+#if FEATURE_LOOP_ALIGN
+        if (GetEmitter()->emitEndsWithAlignInstr())
         {
             // we had better be planning on starting a new IG
             assert(needLabel);
         }
+#endif
 
         if (needLabel)
         {
@@ -745,7 +747,7 @@ void CodeGen::genCodeForBBlist()
 
             case BBJ_COND:
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
                 // This is the last place where we operate on blocks and after this, we operate
                 // on IG. Hence, if we know that the destination of "block" is the first block
                 // of a loop and needs alignment (it has BBF_LOOP_ALIGN), then "block" represents
@@ -766,7 +768,7 @@ void CodeGen::genCodeForBBlist()
                 break;
         }
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
 
         // If next block is the first block of a loop (identified by BBF_LOOP_ALIGN),
         // then need to add align instruction in current "block". Also mark the
diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
index 146c52963b9ed4..b42111611504d7 100644
--- a/src/coreclr/jit/emit.cpp
+++ b/src/coreclr/jit/emit.cpp
@@ -641,7 +641,7 @@ void emitter::emitGenIG(insGroup* ig)
 
     assert(emitCurIGjmpList == nullptr);
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     assert(emitCurIGAlignList == nullptr);
 #endif
 
@@ -831,7 +831,7 @@ insGroup* emitter::emitSavIG(bool emitAdd)
     }
 #endif
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     // Did we have any align instructions in this group?
     if (emitCurIGAlignList)
     {
@@ -996,7 +996,7 @@ void emitter::emitBegFN(bool hasFramePtr
     emitCurIGfreeBase = nullptr;
     emitIGbuffSize    = 0;
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     emitLastAlignedIgNum        = 0;
     emitLastInnerLoopStartIgNum = 0;
     emitLastInnerLoopEndIgNum   = 0;
@@ -1037,7 +1037,7 @@ void emitter::emitBegFN(bool hasFramePtr
     emitNoGCIG     = false;
     emitForceNewIG = false;
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     /* We don't have any align instructions */
 
     emitAlignList = emitAlignLast = nullptr;
@@ -3735,7 +3735,7 @@ size_t emitter::emitIssue1Instr(insGroup* ig, instrDesc* id, BYTE** dp)
         // It is fatal to under-estimate the instruction size, except for alignment instructions
         noway_assert(estimatedSize >= actualSize);
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
         // Should never over-estimate align instruction or any instruction before the last align instruction of a method
         assert(id->idIns() != INS_align && emitCurIG->igNum > emitLastAlignedIgNum);
 #endif
@@ -4610,7 +4610,7 @@ void emitter::emitJumpDistBind()
 #endif // DEBUG
 }
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
 
 //-----------------------------------------------------------------------------
 // emitLoopAlignment: Insert an align instruction at the end of emitCurIG and
@@ -4636,6 +4636,16 @@ void emitter::emitLoopAlignment()
             emitComp->compMethodID, emitCurIG->igNum);
 }
 
+//-----------------------------------------------------------------------------
+//  emitEndsWithAlignInstr: Checks if current IG ends with loop align instruction.
+//
+//  Returns:  true if current IG ends with align instruction.
+//
+bool emitter::emitEndsWithAlignInstr()
+{
+    return emitCurIG->isLoopAlign();
+}
+
 //-----------------------------------------------------------------------------
 //  getLoopSize: Starting from loopHeaderIg, find the size of the smallest possible loop
 //               such that it doesn't exceed the maxLoopSize.
@@ -7762,10 +7772,13 @@ void emitter::emitInitIG(insGroup* ig)
        sure we act the same in non-DEBUG builds.
     */
 
-    ig->igSize         = 0;
-    ig->igGCregs       = RBM_NONE;
-    ig->igInsCnt       = 0;
+    ig->igSize   = 0;
+    ig->igGCregs = RBM_NONE;
+    ig->igInsCnt = 0;
+
+#if FEATURE_LOOP_ALIGN
     ig->igLoopBackEdge = nullptr;
+#endif
 }
 
 /*****************************************************************************
diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
index 98f6167a29cb80..8030cc4b0fb16d 100644
--- a/src/coreclr/jit/emit.h
+++ b/src/coreclr/jit/emit.h
@@ -250,7 +250,10 @@ struct insGroup
     unsigned int   igFuncIdx; // Which function/funclet does this belong to? (Index into Compiler::compFuncInfos array.)
     unsigned short igFlags;   // see IGF_xxx below
     unsigned short igSize;    // # of bytes of code in this group
-    insGroup*      igLoopBackEdge; // "last" back-edge that branches back to an aligned loop head.
+
+#if FEATURE_LOOP_ALIGN
+    insGroup* igLoopBackEdge; // "last" back-edge that branches back to an aligned loop head.
+#endif
 
 #define IGF_GC_VARS 0x0001    // new set of live GC ref variables
 #define IGF_BYREF_REGS 0x0002 // new set of live by-ref registers
@@ -1370,7 +1373,7 @@ class emitter
                                   // hot to cold and cold to hot jumps)
     };
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     struct instrDescAlign : instrDesc
     {
         instrDescAlign* idaNext; // next align in the group/method
@@ -1755,7 +1758,7 @@ class emitter
     instrDescJmp* emitJumpLast;       // last of local jumps in method
     void          emitJumpDistBind(); // Bind all the local jumps in method
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     instrDescAlign* emitCurIGAlignList;                                 // list of align instructions in current IG
     unsigned        emitLastInnerLoopStartIgNum;                        // Start IG of last inner loop
     unsigned        emitLastInnerLoopEndIgNum;                          // End IG of last inner loop
@@ -1764,6 +1767,7 @@ class emitter
     instrDescAlign* emitAlignLast;                                      // last align instruction in method
     unsigned getLoopSize(insGroup* igLoopHeader, unsigned maxLoopSize); // Get the smallest loop size
     void emitLoopAlignment();
+    bool emitEndsWithAlignInstr(); // Validate if newLabel is appropriate
     void emitSetLoopBackEdge(BasicBlock* loopTopBlock);
     void     emitLoopAlignAdjustments(); // Predict if loop alignment is needed and make appropriate adjustments
     unsigned emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offset DEBUG_ARG(bool displayAlignmentDetails));
@@ -2009,7 +2013,7 @@ class emitter
         return (instrDescCGCA*)emitAllocAnyInstr(sizeof(instrDescCGCA), attr);
     }
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     instrDescAlign* emitAllocInstrAlign()
     {
 #if EMITTER_STATS
@@ -2544,7 +2548,7 @@ inline emitter::instrDescJmp* emitter::emitNewInstrJmp()
     return emitAllocInstrJmp();
 }
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
 inline emitter::instrDescAlign* emitter::emitNewInstrAlign()
 {
     instrDescAlign* newInstr = emitAllocInstrAlign();
diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index 5c7941b1515886..fef1caca8cf7d7 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -7388,7 +7388,7 @@ size_t emitter::emitSizeOfInsDsc(instrDesc* id)
     switch (idOp)
     {
         case ID_OP_NONE:
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
             if (id->idIns() == INS_align)
             {
                 return sizeof(instrDescAlign);
@@ -13814,8 +13814,9 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
     {
         emitDispIns(id, false, dspOffs, true, emitCurCodeOffs(*dp), *dp, (dst - *dp));
     }
+#endif
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     // Only compensate over-estimated instructions if emitCurIG is before
     // the last IG that needs alignment.
     if (emitCurIG->igNum <= emitLastAlignedIgNum)
@@ -13852,6 +13853,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
     }
 #endif
 
+#ifdef DEBUG
     if (emitComp->compDebugBreak)
     {
         // set JitEmitPrintRefRegs=1 will print out emitThisGCrefRegs and emitThisByrefRegs
diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h
index 81ecb0e8c52c69..5ffab7c0f29e96 100644
--- a/src/coreclr/jit/jitconfigvalues.h
+++ b/src/coreclr/jit/jitconfigvalues.h
@@ -223,7 +223,7 @@ CONFIG_INTEGER(EnableIncompleteISAClass, W("EnableIncompleteISAClass"), 0) // En
                                                                            // intrinsic classes
 #endif                                                                     // defined(DEBUG)
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
 CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 1) // If set, align inner loops
 #else
 CONFIG_INTEGER(JitAlignLoops, W("JitAlignLoops"), 0)
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index f65f0edfd022a9..e286bc26d3fba7 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -16317,7 +16317,7 @@ bool Compiler::fgFoldConditional(BasicBlock* block)
                          * Remove the loop from the table */
 
                         optLoopTable[loopNum].lpFlags |= LPFLG_REMOVED;
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
                         optLoopTable[loopNum].lpFirst->bbFlags &= ~BBF_LOOP_ALIGN;
                         JITDUMP("Removing LOOP_ALIGN flag from bogus loop in " FMT_BB "\n",
                                 optLoopTable[loopNum].lpFirst->bbNum);
diff --git a/src/coreclr/jit/optimizer.cpp b/src/coreclr/jit/optimizer.cpp
index 3f58496de28d7e..ddadd938fcfc68 100644
--- a/src/coreclr/jit/optimizer.cpp
+++ b/src/coreclr/jit/optimizer.cpp
@@ -2586,7 +2586,7 @@ void Compiler::optFindNaturalLoops()
 
 void Compiler::optIdentifyLoopsForAlignment()
 {
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     if (codeGen->ShouldAlignLoops())
     {
         for (unsigned char loopInd = 0; loopInd < optLoopCount; loopInd++)
@@ -3792,7 +3792,7 @@ void Compiler::optUnrollLoops()
 #endif
         }
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
         for (block = head->bbNext;; block = block->bbNext)
         {
             if (block->isLoopAlign())
@@ -8032,7 +8032,7 @@ bool Compiler::optComputeLoopSideEffectsOfBlock(BasicBlock* blk)
 void Compiler::AddContainsCallAllContainingLoops(unsigned lnum)
 {
 
-#ifdef FEATURE_LOOP_ALIGN
+#if FEATURE_LOOP_ALIGN
     // If this is the inner most loop, reset the LOOP_ALIGN flag
     // because a loop having call will not likely to benefit from
     // alignment

From dbeb7d62f22a4792500ba8939d0267cc09778f08 Mon Sep 17 00:00:00 2001
From: Kunal Pathak <Kunal.Pathak@microsoft.com>
Date: Mon, 11 Jan 2021 17:17:29 -0800
Subject: [PATCH 59/59] Add missing #ifdef DEBUG

---
 src/coreclr/jit/emitxarch.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/coreclr/jit/emitxarch.cpp b/src/coreclr/jit/emitxarch.cpp
index fef1caca8cf7d7..b6ca4dd7030a3e 100644
--- a/src/coreclr/jit/emitxarch.cpp
+++ b/src/coreclr/jit/emitxarch.cpp
@@ -13825,6 +13825,8 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
         assert(diff >= 0);
         if (diff != 0)
         {
+
+#ifdef DEBUG
             // should never over-estimate align instruction
             assert(id->idIns() != INS_align);
             JITDUMP("Added over-estimation compensation: %d\n", diff);
@@ -13834,6 +13836,7 @@ size_t emitter::emitOutputInstr(insGroup* ig, instrDesc* id, BYTE** dp)
                 emitDispInsAddr(dst);
                 printf("\t\t  ;; NOP compensation instructions of %d bytes.\n", diff);
             }
+#endif
 
             dst = emitOutputNOP(dst, diff);