From a8616d941556bc03e2519aac062761489083681f Mon Sep 17 00:00:00 2001
From: Jakob Botsch Nielsen
Date: Thu, 4 Jul 2024 10:46:42 +0200
Subject: [PATCH] JIT: Add a disabled-by-default implementation of strength
 reduction (#104243)

This adds a disabled-by-default implementation of strength reduction. At this
point the implementation should be correct; however, it is currently both a
size and a perfscore regression when enabled. More work will be needed to get
the heuristics right and to make it kick in for more cases.

Strength reduction replaces "expensive" operations computed on every loop
iteration with cheaper ones by creating more induction variables. In C# terms
it effectively transforms something like

```
private struct S { public int A, B, C; }

[MethodImpl(MethodImplOptions.NoInlining)]
private static int Sum(S[] ss)
{
    int sum = 0;
    foreach (S v in ss)
    {
        sum += v.A;
        sum += v.B;
        sum += v.C;
    }

    return sum;
}
```

into an equivalent

```
int sum = 0;
ref S curS = ref ss[0];
for (int i = 0; i < ss.Length; i++)
{
    sum += curS.A;
    sum += curS.B;
    sum += curS.C;
    curS = ref Unsafe.Add(ref curS, 1);
}
```

With strength reduction enabled this PR thus changes codegen of the standard
`foreach` version above from

```asm
G_M63518_IG03:  ;; offset=0x0011
    lea      r10, [rdx+2*rdx]
    lea      r10, bword ptr [rcx+4*r10+0x10]
    mov      r9d, dword ptr [r10]
    mov      r11d, dword ptr [r10+0x04]
    mov      r10d, dword ptr [r10+0x08]
    add      eax, r9d
    add      eax, r11d
    add      eax, r10d
    inc      edx
    cmp      r8d, edx
    jg       SHORT G_M63518_IG03
    ;; size=36 bbWeight=4 PerfScore 39.00
```

to

```asm
G_M63518_IG04:  ;; offset=0x0011
    mov      r8, rcx
    mov      r10d, dword ptr [r8]
    mov      r9d, dword ptr [r8+0x04]
    mov      r8d, dword ptr [r8+0x08]
    add      eax, r10d
    add      eax, r9d
    add      eax, r8d
    add      rcx, 12
    dec      edx
    jne      SHORT G_M63518_IG04
    ;; size=31 bbWeight=4 PerfScore 34.00
```

on x64. Further improvements can be made to enable better address modes.

The current heuristics try to ensure that we do not actually end up with more
primary induction variables. Strength reduction only kicks in when it expects
that all uses of the primary IV can be replaced by the new primary IV.
However, uses inside loop exit tests are allowed to stay unreplaced, under the
assumption that the downwards loop transformation will be able to get rid of
them.

Getting the cases around overflow right turned out to be hard and required
reasoning about trip counts that was added in a previous PR. Generally, the
issue is that we need to prove that transforming a zero extension of an add
recurrence into a 64-bit add recurrence is legal. For example, for a simple
case of

```
for (int i = 0; i < arr.Length; i++)
  sum += arr[i];
```

the IV analysis eventually ends up wanting to show that
`zext<64>(int32 <L00, 0, 1>) => int64 <L00, 0, 1>` is a correct
transformation. This requires showing that the add recurrence does not step
past 2^32-1, which requires the bound on the trip count that we can now
compute. The reasoning done for both the trip count and the overflow is still
very limited but can be improved incrementally.

The implementation works by considering every primary IV of the loop in turn,
and by initializing 'cursors' pointing to each use of the primary IV. It then
repeatedly tries to advance these cursors to the parents of the uses, as long
as doing so results in a new set of cursors that still compute the same (now
derived) IV. If it manages to do this at least once, then replacing the
cursors with a new primary IV should make the old primary IV unnecessary,
while replacing some operations with cheaper ones.
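To make the cursor and derived-IV terminology concrete, here is a hand-written
illustration (not JIT output; the loop, the names and the exact derivation
steps are assumptions for exposition). Every use of the primary IV `i` below
sits under the address computation `p + i * 4`, so the cursors can be advanced
from `i` through `i * 4` (1-derived) to `p + i * 4` (2-derived) while all
still computing the same IV; the 2-derived IV is then materialized as a new
primary IV with step 4, and `i` survives only in the exit test:

```
// Illustrative sketch only. 'p' is a raw (non-GC) pointer, so the bail-out on
// GC-typed add recurrences in the current implementation does not apply here.
private static unsafe void Clear(int* p, int n)
{
    for (int i = 0; i < n; i++)
    {
        // Conceptually the cursors advance from this use of 'i', to 'i * 4'
        // (1-derived IV), to 'p + i * 4' (2-derived IV). Strength reduction
        // introduces a new primary IV computing 'p + i * 4' directly and
        // replaces this element address with it; the remaining use of 'i' in
        // the exit test is expected to be removed by making the loop
        // downwards counted.
        p[i] = 0;
    }
}
```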
--- src/coreclr/jit/arraystack.h | 5 + src/coreclr/jit/compiler.h | 6 +- src/coreclr/jit/inductionvariableopts.cpp | 767 +++++++++++++++++++++- src/coreclr/jit/jitconfigvalues.h | 3 + src/coreclr/jit/jitmetadatalist.h | 1 + src/coreclr/jit/scev.cpp | 236 ++++++- src/coreclr/jit/scev.h | 16 +- 7 files changed, 1000 insertions(+), 34 deletions(-) diff --git a/src/coreclr/jit/arraystack.h b/src/coreclr/jit/arraystack.h index 5d8a697a3820d..872e64f3c8c99 100644 --- a/src/coreclr/jit/arraystack.h +++ b/src/coreclr/jit/arraystack.h @@ -121,6 +121,11 @@ class ArrayStack tosIndex = 0; } + T* Data() + { + return data; + } + private: CompAllocator m_alloc; int tosIndex; // first free location diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h index 633653101e01a..488d6a01de9e1 100644 --- a/src/coreclr/jit/compiler.h +++ b/src/coreclr/jit/compiler.h @@ -6430,11 +6430,9 @@ class Compiler Statement* fgNewStmtAtEnd(BasicBlock* block, GenTree* tree, const DebugInfo& di = DebugInfo()); Statement* fgNewStmtNearEnd(BasicBlock* block, GenTree* tree, const DebugInfo& di = DebugInfo()); -private: void fgInsertStmtNearEnd(BasicBlock* block, Statement* stmt); void fgInsertStmtAtBeg(BasicBlock* block, Statement* stmt); -public: void fgInsertStmtAfter(BasicBlock* block, Statement* insertionPoint, Statement* stmt); void fgInsertStmtBefore(BasicBlock* block, Statement* insertionPoint, Statement* stmt); @@ -7563,6 +7561,8 @@ class Compiler PhaseStatus optInductionVariables(); + template + void optVisitBoundingExitingCondBlocks(FlowGraphNaturalLoop* loop, TFunctor func); bool optMakeLoopDownwardsCounted(ScalarEvolutionContext& scevContext, FlowGraphNaturalLoop* loop, LoopLocalOccurrences* loopLocals); @@ -10345,6 +10345,8 @@ class Compiler STRESS_MODE(OPT_REPEAT) /* stress JitOptRepeat */ \ STRESS_MODE(INITIAL_PARAM_REG) /* Stress initial register assigned to parameters */ \ STRESS_MODE(DOWNWARDS_COUNTED_LOOPS) /* Make more loops downwards counted */ \ + STRESS_MODE(STRENGTH_REDUCTION) /* Enable strength reduction */ \ + STRESS_MODE(STRENGTH_REDUCTION_PROFITABILITY) /* Do more strength reduction */ \ \ /* After COUNT_VARN, stress level 2 does all of these all the time */ \ \ diff --git a/src/coreclr/jit/inductionvariableopts.cpp b/src/coreclr/jit/inductionvariableopts.cpp index 4311bb0c3ce42..bdbece4cc3614 100644 --- a/src/coreclr/jit/inductionvariableopts.cpp +++ b/src/coreclr/jit/inductionvariableopts.cpp @@ -67,6 +67,8 @@ class LoopLocalOccurrences template bool VisitStatementsWithOccurrences(FlowGraphNaturalLoop* loop, unsigned lclNum, TFunc func); + + void Invalidate(FlowGraphNaturalLoop* loop); }; LoopLocalOccurrences::LoopLocalOccurrences(FlowGraphNaturalLoops* loops) @@ -184,7 +186,7 @@ bool LoopLocalOccurrences::VisitLoopNestMaps(FlowGraphNaturalLoop* loop, TFunc& // Visit all occurrences of the specified local inside the loop. 
// // Type parameters: -// TFunc - Functor of type bool(Block*, Statement*, GenTree*) +// TFunc - Functor of type bool(Block*, Statement*, GenTreeLclVarCommon*) // // Parameters: // loop - The loop @@ -240,7 +242,7 @@ bool LoopLocalOccurrences::VisitOccurrences(FlowGraphNaturalLoop* loop, unsigned // bool LoopLocalOccurrences::HasAnyOccurrences(FlowGraphNaturalLoop* loop, unsigned lclNum) { - if (!VisitOccurrences(loop, lclNum, [](BasicBlock* block, Statement* stmt, GenTree* tree) { + if (!VisitOccurrences(loop, lclNum, [](BasicBlock* block, Statement* stmt, GenTreeLclVarCommon* tree) { return false; })) { @@ -314,6 +316,32 @@ bool LoopLocalOccurrences::VisitStatementsWithOccurrences(FlowGraphNaturalLoop* return VisitLoopNestMaps(loop, visitor); } +//------------------------------------------------------------------------ +// Invalidate: Invalidate all information about locals in the specified loop +// and its child loops. +// +// Parameters: +// loop - The loop +// +void LoopLocalOccurrences::Invalidate(FlowGraphNaturalLoop* loop) +{ + for (FlowGraphNaturalLoop* child = loop->GetChild(); child != nullptr; child = child->GetSibling()) + { + Invalidate(child); + } + + if (m_maps[loop->GetIndex()] != nullptr) + { + m_maps[loop->GetIndex()] = nullptr; + + BitVecTraits poTraits = m_loops->GetDfsTree()->PostOrderTraits(); + loop->VisitLoopBlocks([=, &poTraits](BasicBlock* block) { + BitVecOps::RemoveElemD(&poTraits, m_visitedBlocks, block->bbPostorderNum); + return BasicBlockVisit::Continue; + }); + } +} + //------------------------------------------------------------------------ // optCanSinkWidenedIV: Check to see if we are able to sink a store to the old // local into the exits of a loop if we decide to widen. @@ -862,23 +890,17 @@ bool Compiler::optWidenPrimaryIV(FlowGraphNaturalLoop* loop, } //------------------------------------------------------------------------ -// optMakeLoopDownwardsCounted: Transform a loop to be downwards counted if -// profitable and legal. +// optVisitBoundingExitingBlocks: Visit all the exiting BBJ_COND blocks of the +// loop that dominate all the loop's backedges. These exiting blocks bound the +// trip count of the loop. // // Parameters: -// scevContext - Context for scalar evolution -// loop - Loop to transform -// loopLocals - Data structure that tracks occurrences of locals in the loop -// -// Returns: -// True if the loop was made downwards counted; otherwise false. +// loop - The loop +// func - The functor, of type void(BasicBlock*). // -bool Compiler::optMakeLoopDownwardsCounted(ScalarEvolutionContext& scevContext, - FlowGraphNaturalLoop* loop, - LoopLocalOccurrences* loopLocals) +template +void Compiler::optVisitBoundingExitingCondBlocks(FlowGraphNaturalLoop* loop, TFunctor func) { - JITDUMP("Checking if we should make " FMT_LP " downwards counted\n", loop->GetIndex()); - BasicBlock* dominates = nullptr; for (FlowEdge* backEdge : loop->BackEdges()) @@ -899,13 +921,37 @@ bool Compiler::optMakeLoopDownwardsCounted(ScalarEvolutionContext& scevContext, if (dominates->KindIs(BBJ_COND) && (!loop->ContainsBlock(dominates->GetTrueTarget()) || !loop->ContainsBlock(dominates->GetFalseTarget()))) { - JITDUMP(" Considering exiting block " FMT_BB "\n", dominates->bbNum); // 'dominates' is an exiting block that dominates all backedges. 
- changed |= optMakeExitTestDownwardsCounted(scevContext, loop, dominates, loopLocals); + func(dominates); } dominates = dominates->bbIDom; } +} + +//------------------------------------------------------------------------ +// optMakeLoopDownwardsCounted: Transform a loop to be downwards counted if +// profitable and legal. +// +// Parameters: +// scevContext - Context for scalar evolution +// loop - Loop to transform +// loopLocals - Data structure that tracks occurrences of locals in the loop +// +// Returns: +// True if the loop was made downwards counted; otherwise false. +// +bool Compiler::optMakeLoopDownwardsCounted(ScalarEvolutionContext& scevContext, + FlowGraphNaturalLoop* loop, + LoopLocalOccurrences* loopLocals) +{ + JITDUMP("Checking if we should make " FMT_LP " downwards counted\n", loop->GetIndex()); + + bool changed = false; + optVisitBoundingExitingCondBlocks(loop, [=, &scevContext, &changed](BasicBlock* exiting) { + JITDUMP(" Considering exiting block " FMT_BB "\n", exiting->bbNum); + changed |= optMakeExitTestDownwardsCounted(scevContext, loop, exiting, loopLocals); + }); return changed; } @@ -1147,6 +1193,684 @@ bool Compiler::optMakeExitTestDownwardsCounted(ScalarEvolutionContext& scevConte return true; } +struct CursorInfo +{ + BasicBlock* Block; + Statement* Stmt; + GenTree* Tree; + ScevAddRec* IV; + bool IsInsideExitTest = false; + + CursorInfo(BasicBlock* block, Statement* stmt, GenTree* tree, ScevAddRec* iv, bool isInsideExitTest) + : Block(block) + , Stmt(stmt) + , Tree(tree) + , IV(iv) + , IsInsideExitTest(isInsideExitTest) + { + } +}; + +class StrengthReductionContext +{ + Compiler* m_comp; + ScalarEvolutionContext& m_scevContext; + FlowGraphNaturalLoop* m_loop; + LoopLocalOccurrences& m_loopLocals; + + ArrayStack m_backEdgeBounds; + SimplificationAssumptions m_simplAssumptions; + ArrayStack m_cursors1; + ArrayStack m_cursors2; + + void InitializeSimplificationAssumptions(); + bool InitializeCursors(GenTreeLclVarCommon* primaryIVLcl, ScevAddRec* primaryIV); + void AdvanceCursors(ArrayStack* cursors, ArrayStack* nextCursors); + bool CheckAdvancedCursors(ArrayStack* cursors, int derivedLevel, ScevAddRec** nextIV); + bool TryReplaceUsesWithNewPrimaryIV(ArrayStack* cursors, ScevAddRec* iv); + BasicBlock* FindUpdateInsertionPoint(ArrayStack* cursors); + + bool StressProfitability() + { + return m_comp->compStressCompile(Compiler::STRESS_STRENGTH_REDUCTION_PROFITABILITY, 50); + } + +public: + StrengthReductionContext(Compiler* comp, + ScalarEvolutionContext& scevContext, + FlowGraphNaturalLoop* loop, + LoopLocalOccurrences& loopLocals) + : m_comp(comp) + , m_scevContext(scevContext) + , m_loop(loop) + , m_loopLocals(loopLocals) + , m_backEdgeBounds(comp->getAllocator(CMK_LoopIVOpts)) + , m_cursors1(comp->getAllocator(CMK_LoopIVOpts)) + , m_cursors2(comp->getAllocator(CMK_LoopIVOpts)) + { + } + + bool TryStrengthReduce(); +}; + +//------------------------------------------------------------------------ +// TryStrengthReduce: Check for legal and profitable derived IVs to introduce +// new primary IVs for. +// +// Returns: +// True if any new primary IV was introduced; otherwise false. 
+// +bool StrengthReductionContext::TryStrengthReduce() +{ + JITDUMP("Considering " FMT_LP " for strength reduction...\n", m_loop->GetIndex()); + + if ((JitConfig.JitEnableStrengthReduction() == 0) && + !m_comp->compStressCompile(Compiler::STRESS_STRENGTH_REDUCTION, 50)) + { + JITDUMP(" Disabled: no stress mode\n"); + return false; + } + + // Compute information about the loop used to simplify SCEVs. + InitializeSimplificationAssumptions(); + + JITDUMP(" Considering primary IVs\n"); + + // We strength reduce only candidates where we see that we'll be able to + // remove all uses of a primary IV by introducing a different primary IV. + // + // The algorithm here works in the following way: we process each primary + // IV in turn. For every primary IV, we create a 'cursor' pointing to every + // use of that primary IV. We then continuously advance each cursor to the + // parent node as long as all cursors represent the same derived IV. Once we + // find out that the cursors are no longer the same derived IV we stop. + // + // We keep two lists here so that we can keep track of the most advanced + // cursor where all cursors pointed to the same derived IV, in which case + // we can strength reduce. + + bool strengthReducedAny = false; + for (Statement* stmt : m_loop->GetHeader()->Statements()) + { + if (!stmt->IsPhiDefnStmt()) + { + break; + } + + DISPSTMT(stmt); + + GenTreeLclVarCommon* primaryIVLcl = stmt->GetRootNode()->AsLclVarCommon(); + Scev* candidate = m_scevContext.Analyze(m_loop->GetHeader(), primaryIVLcl); + if (candidate == nullptr) + { + JITDUMP(" Could not analyze header PHI\n"); + continue; + } + + candidate = m_scevContext.Simplify(candidate, m_simplAssumptions); + + JITDUMP(" => "); + DBEXEC(m_comp->verbose, candidate->Dump(m_comp)); + + JITDUMP("\n"); + if (!candidate->OperIs(ScevOper::AddRec)) + { + JITDUMP(" Not an addrec\n"); + continue; + } + + ScevAddRec* primaryIV = static_cast(candidate); + + InitializeCursors(primaryIVLcl, primaryIV); + + ArrayStack* cursors = &m_cursors1; + ArrayStack* nextCursors = &m_cursors2; + + int derivedLevel = 0; + ScevAddRec* currentIV = primaryIV; + + while (true) + { + JITDUMP(" Advancing cursors to be %d-derived\n", derivedLevel + 1); + + // Advance cursors and store the result in 'nextCursors' + AdvanceCursors(cursors, nextCursors); + + // Verify that all cursors still represent the same IV + ScevAddRec* nextIV = nullptr; + if (!CheckAdvancedCursors(nextCursors, derivedLevel + 1, &nextIV)) + { + break; + } + assert(nextIV != nullptr); + + // We need more sanity checks to allow materializing GC-typed add + // recs. Otherwise we may eagerly form a GC pointer that was only + // lazily formed under some conditions before, which can be + // illegal. For now we just bail. + if (varTypeIsGC(nextIV->Type)) + { + JITDUMP(" Next IV has type %s. 
Bailing.\n", varTypeName(nextIV->Type)); + break; + } + + derivedLevel++; + std::swap(cursors, nextCursors); + currentIV = nextIV; + } + + if (derivedLevel <= 0) + { + continue; + } + + JITDUMP(" All uses of primary IV V%02u are used to compute a %d-derived IV ", primaryIVLcl->GetLclNum(), + derivedLevel); + DBEXEC(VERBOSE, currentIV->Dump(m_comp)); + JITDUMP("\n"); + + if (Scev::Equals(currentIV->Step, primaryIV->Step) && !StressProfitability()) + { + JITDUMP(" Skipping: candidate has same step as primary IV\n"); + continue; + } + + if (TryReplaceUsesWithNewPrimaryIV(cursors, currentIV)) + { + strengthReducedAny = true; + m_loopLocals.Invalidate(m_loop); + } + } + + return strengthReducedAny; +} + +//------------------------------------------------------------------------ +// InitializeSimplificationAssumptions: Compute assumptions that can be used +// when simplifying SCEVs. +// +void StrengthReductionContext::InitializeSimplificationAssumptions() +{ + m_comp->optVisitBoundingExitingCondBlocks(m_loop, [=](BasicBlock* exiting) { + Scev* exitNotTakenCount = m_scevContext.ComputeExitNotTakenCount(exiting); + if (exitNotTakenCount != nullptr) + { + m_backEdgeBounds.Push(exitNotTakenCount); + } + }); + + m_simplAssumptions.BackEdgeTakenBound = m_backEdgeBounds.Data(); + m_simplAssumptions.NumBackEdgeTakenBound = static_cast(m_backEdgeBounds.Height()); + +#ifdef DEBUG + if (m_comp->verbose) + { + printf(" Bound on backedge taken count is "); + if (m_simplAssumptions.NumBackEdgeTakenBound == 0) + { + printf("\n"); + } + + const char* pref = m_simplAssumptions.NumBackEdgeTakenBound > 1 ? "min(" : ""; + for (unsigned i = 0; i < m_simplAssumptions.NumBackEdgeTakenBound; i++) + { + printf("%s", pref); + m_simplAssumptions.BackEdgeTakenBound[i]->Dump(m_comp); + } + + printf("%s\n", m_simplAssumptions.NumBackEdgeTakenBound > 1 ? ")" : ""); + } +#endif +} + +//------------------------------------------------------------------------ +// InitializeCursors: Reset and initialize both cursor lists with information about all +// uses of the specified primary IV. +// +// Parameters: +// primaryIVLcl - Local representing a candidate primary IV for strength reduction +// primaryIV - SCEV for the candidate +// +// Returns: +// True if all uses were analyzed and cursors could be introduced for them +// all; otherwise false. +// +// Remarks: +// A cursor is created for a use when it represents the same value as the +// primary IV passed. The function will allow mismatching uses if the use is +// expected to be removed in the downwards loop transformation. Otherwise the +// function will fail. +// +// It is not a correctness requirement that we remove all uses; if we end up +// not doing so (e.g. because a cursor was not created by this function), +// then we may just end up with extra primary IVs in the loop. +// +bool StrengthReductionContext::InitializeCursors(GenTreeLclVarCommon* primaryIVLcl, ScevAddRec* primaryIV) +{ + m_cursors1.Reset(); + m_cursors2.Reset(); + + auto visitor = [=](BasicBlock* block, Statement* stmt, GenTreeLclVarCommon* tree) { + if (stmt->GetRootNode()->OperIsLocalStore()) + { + GenTreeLclVarCommon* lcl = stmt->GetRootNode()->AsLclVarCommon(); + if ((lcl->GetLclNum() == primaryIVLcl->GetLclNum()) && ((lcl->Data()->gtFlags & GTF_SIDE_EFFECT) == 0)) + { + // Store to the primary IV without side effects; if we end + // up strength reducing, then this store is expected to be + // removed by making the loop downwards counted. 
+ return true; + } + } + + if (!tree->OperIs(GT_LCL_VAR)) + { + return false; + } + + bool isInsideExitTest = + block->KindIs(BBJ_COND) && (stmt == block->lastStmt()) && + (!m_loop->ContainsBlock(block->GetTrueTarget()) || !m_loop->ContainsBlock(block->GetFalseTarget())); + + if (tree->GetSsaNum() != primaryIVLcl->GetSsaNum()) + { + // Most likely a post-incremented use of the primary IV; we + // could replace these as well, but currently we only handle + // the cases where we expect the use to be removed. + return isInsideExitTest; + } + + Scev* iv = m_scevContext.Analyze(block, tree); + if (iv == nullptr) + { + // May not be able to analyze the use if it's mistyped (e.g. + // LCL_VAR(TYP_I_IMPL LclVarDsc)), or an int use of a long + // local. + // Just bail on these cases. + return false; + } + + // If we _did_ manage to analyze it then we expect it to be the same IV + // as the primary IV. + assert(Scev::Equals(m_scevContext.Simplify(iv, m_simplAssumptions), primaryIV)); + + m_cursors1.Emplace(block, stmt, tree, primaryIV, isInsideExitTest); + m_cursors2.Emplace(block, stmt, tree, primaryIV, isInsideExitTest); + return true; + }; + + if (!m_loopLocals.VisitOccurrences(m_loop, primaryIVLcl->GetLclNum(), visitor) || (m_cursors1.Height() <= 0)) + { + JITDUMP(" Could not create cursors for all loop uses of primary IV"); + return false; + } + + JITDUMP(" Found %d cursors using primary IV V%02u\n", m_cursors1.Height(), primaryIVLcl->GetLclNum()); + +#ifdef DEBUG + if (m_comp->verbose) + { + for (int i = 0; i < m_cursors1.Height(); i++) + { + CursorInfo& cursor = m_cursors1.BottomRef(i); + printf(" [%d] [%06u]%s: ", i, Compiler::dspTreeID(cursor.Tree), + cursor.IsInsideExitTest ? " (in-test)" : ""); + cursor.IV->Dump(m_comp); + printf("\n"); + } + } +#endif + + return true; +} + +//------------------------------------------------------------------------ +// AdvanceCursors: Advance cursors stored in "cursors" and store the advanced +// result in "nextCursors". +// +// Parameters: +// cursors - [in] List of current cursors. Unmodified. +// nextCursors - [in, out] List of next cursors. The "Tree" and "IV" fields +// of these cursors will be updated to point to the next derived +// IV. +// +void StrengthReductionContext::AdvanceCursors(ArrayStack* cursors, ArrayStack* nextCursors) +{ + for (int i = 0; i < cursors->Height(); i++) + { + CursorInfo& cursor = cursors->BottomRef(i); + CursorInfo& nextCursor = nextCursors->BottomRef(i); + + assert((nextCursor.Block == cursor.Block) && (nextCursor.Stmt == cursor.Stmt) && + (nextCursor.IsInsideExitTest == cursor.IsInsideExitTest)); + + nextCursor.Tree = cursor.Tree; + do + { + GenTree* cur = nextCursor.Tree; + nextCursor.Tree = cur->gtGetParent(nullptr); + + if ((nextCursor.Tree == nullptr) || + (nextCursor.Tree->OperIs(GT_COMMA) && (nextCursor.Tree->gtGetOp1() == cur))) + { + nextCursor.IV = nullptr; + break; + } + + // TODO-CQ: If this is now the source to a store, we can + // look for uses of the LHS local and add those as cursors + // as well. 
+ Scev* parentIV = m_scevContext.Analyze(nextCursor.Block, nextCursor.Tree); + if (parentIV == nullptr) + { + nextCursor.IV = nullptr; + break; + } + + parentIV = m_scevContext.Simplify(parentIV, m_simplAssumptions); + assert(parentIV != nullptr); + if (!parentIV->OperIs(ScevOper::AddRec)) + { + nextCursor.IV = nullptr; + break; + } + + nextCursor.IV = static_cast(parentIV); + } while (Scev::Equals(nextCursor.IV, cursor.IV)); + } + +#ifdef DEBUG + if (m_comp->verbose) + { + for (int i = 0; i < nextCursors->Height(); i++) + { + CursorInfo& nextCursor = nextCursors->BottomRef(i); + printf(" [%d] [%06u]%s: ", i, nextCursor.Tree == nullptr ? 0 : Compiler::dspTreeID(nextCursor.Tree), + nextCursor.IsInsideExitTest ? " (in-test)" : ""); + if (nextCursor.IV == nullptr) + { + printf(""); + } + else + { + nextCursor.IV->Dump(m_comp); + } + printf("\n"); + } + } +#endif +} + +//------------------------------------------------------------------------ +// CheckAdvancedCursors: Check whether the specified advanced cursors still +// represent a valid set of cursors to introduce a new primary IV for. +// +// Parameters: +// cursors - List of cursors that were advanced. +// derivedLevel - The derived level of the advanced IVs. That is, the number +// of times they are derived from the primary IV. +// nextIV - [out] The next derived IV from the subset of advanced +// cursors to now consider strength reducing. +// +// Returns: +// True if all cursors still represent a common derived IV and would be +// replacable by a new primary IV computing it. +// +// Remarks: +// This function may remove cursors from m_cursors1 and m_cursors2 if it +// decides to no longer consider some cursors for strength reduction. +// +bool StrengthReductionContext::CheckAdvancedCursors(ArrayStack* cursors, + int derivedLevel, + ScevAddRec** nextIV) +{ + *nextIV = nullptr; + + for (int i = 0; i < cursors->Height(); i++) + { + CursorInfo& cursor = cursors->BottomRef(i); + + // Uses inside the exit test only need to opportunistically + // match. We check these after. + if (cursor.IsInsideExitTest) + { + continue; + } + + if ((cursor.IV != nullptr) && ((*nextIV == nullptr) || Scev::Equals(cursor.IV, *nextIV))) + { + *nextIV = cursor.IV; + continue; + } + + JITDUMP(" [%d] does not match; will not advance\n", i); + return false; + } + + // Now check all exit test uses. + for (int i = 0; i < cursors->Height(); i++) + { + CursorInfo& cursor = cursors->BottomRef(i); + + if (!cursor.IsInsideExitTest) + { + continue; + } + + if ((cursor.IV != nullptr) && ((*nextIV == nullptr) || Scev::Equals(cursor.IV, *nextIV))) + { + *nextIV = cursor.IV; + continue; + } + + // Use inside exit test does not match. + if (derivedLevel <= 1) + { + // We weren't able to advance the match in the exit test at all; in + // this situation we expect the downwards optimization to be able + // to remove the use of the primary IV, so this is ok. Remove the + // cursor pointing to the use inside the test. + JITDUMP(" [%d] does not match, but is inside loop test; ignoring mismatch and removing cursor\n", i); + + std::swap(m_cursors1.BottomRef(i), m_cursors1.TopRef(0)); + std::swap(m_cursors2.BottomRef(i), m_cursors2.TopRef(0)); + + m_cursors1.Pop(); + m_cursors2.Pop(); + + i--; + } + else + { + // We already found a derived IV in the exit test that matches, so + // stop here and allow the replacement to replace the uses of the + // current derived IV, including the one in the exit test + // statement. 
+ JITDUMP(" [%d] does not match; will not advance\n", i); + return false; + } + } + + return *nextIV != nullptr; +} + +//------------------------------------------------------------------------ +// TryReplaceUsesWithNewPrimaryIV: Perform final sanity checks before +// introducing a new primary IV and replacing the uses represented by the +// specified cursors with it. +// +// Parameters: +// cursors - List of cursors representing uses to replace +// iv - IV to introduce a primary IV for +// +// Returns: +// True if the IV was introduced and uses were rewritten. +// +bool StrengthReductionContext::TryReplaceUsesWithNewPrimaryIV(ArrayStack* cursors, ScevAddRec* iv) +{ + int64_t stepCns; + if (!iv->Step->GetConstantValue(m_comp, &stepCns)) + { + // For other cases it's non-trivial to know if we can materialize + // the value as IR in the step block. + JITDUMP(" Skipping: step value is not a constant\n"); + return false; + } + + BasicBlock* insertionPoint = FindUpdateInsertionPoint(cursors); + if (insertionPoint == nullptr) + { + JITDUMP(" Skipping: could not find a legal insertion point for the new IV update\n"); + return false; + } + + BasicBlock* preheader = m_loop->EntryEdge(0)->getSourceBlock(); + GenTree* initValue = m_scevContext.Materialize(iv->Start); + if (initValue == nullptr) + { + JITDUMP(" Skipping: init value could not be materialized\n"); + return false; + } + + JITDUMP(" Strength reducing\n"); + + GenTree* stepValue = m_scevContext.Materialize(iv->Step); + assert(stepValue != nullptr); + + unsigned newPrimaryIV = m_comp->lvaGrabTemp(false DEBUGARG("Strength reduced derived IV")); + GenTree* initStore = m_comp->gtNewTempStore(newPrimaryIV, initValue); + Statement* initStmt = m_comp->fgNewStmtFromTree(initStore); + m_comp->fgInsertStmtNearEnd(preheader, initStmt); + + JITDUMP(" Inserting init statement in preheader " FMT_BB "\n", preheader->bbNum); + DISPSTMT(initStmt); + + GenTree* nextValue = + m_comp->gtNewOperNode(GT_ADD, iv->Type, m_comp->gtNewLclVarNode(newPrimaryIV, iv->Type), stepValue); + GenTree* stepStore = m_comp->gtNewTempStore(newPrimaryIV, nextValue); + Statement* stepStmt = m_comp->fgNewStmtFromTree(stepStore); + m_comp->fgInsertStmtNearEnd(insertionPoint, stepStmt); + + JITDUMP(" Inserting step statement in " FMT_BB "\n", insertionPoint->bbNum); + DISPSTMT(stepStmt); + + // Replace uses. + for (int i = 0; i < cursors->Height(); i++) + { + CursorInfo& cursor = cursors->BottomRef(i); + GenTree* newUse = m_comp->gtNewLclVarNode(newPrimaryIV, iv->Type); + + JITDUMP(" Replacing use [%06u] with [%06u]. Before:\n", Compiler::dspTreeID(cursor.Tree), + Compiler::dspTreeID(newUse)); + DISPSTMT(cursor.Stmt); + + GenTree** use = nullptr; + if (cursor.Stmt->GetRootNode() == cursor.Tree) + { + use = cursor.Stmt->GetRootNodePointer(); + } + else + { + cursor.Tree->gtGetParent(&use); + assert(use != nullptr); + } + + GenTree* sideEffects = nullptr; + m_comp->gtExtractSideEffList(cursor.Tree, &sideEffects); + if (sideEffects != nullptr) + { + *use = m_comp->gtNewOperNode(GT_COMMA, newUse->TypeGet(), sideEffects, newUse); + } + else + { + *use = newUse; + } + JITDUMP("\n After:\n\n"); + DISPSTMT(cursor.Stmt); + + m_comp->gtSetStmtInfo(cursor.Stmt); + m_comp->fgSetStmtSeq(cursor.Stmt); + m_comp->gtUpdateStmtSideEffects(cursor.Stmt); + } + + return true; +} + +//------------------------------------------------------------------------ +// FindUpdateInsertionPoint: Find a block at which to insert the "self-update" +// of a new primary IV introduced by strength reduction. 
+// +// Parameters: +// cursors - The list of cursors pointing to uses that are being replaced by +// the new IV +// +// Returns: +// Basic block; the insertion point is the end (before a potential +// terminator) of this basic block. May return null if no insertion point +// could be found. +// +BasicBlock* StrengthReductionContext::FindUpdateInsertionPoint(ArrayStack* cursors) +{ + // Find insertion point. It needs to post-dominate all uses we are going to + // replace and it needs to dominate all backedges. + // TODO-CQ: Canonicalizing backedges would make this simpler and work in + // more cases. + + BasicBlock* insertionPoint = nullptr; + for (FlowEdge* backEdge : m_loop->BackEdges()) + { + if (insertionPoint == nullptr) + { + insertionPoint = backEdge->getSourceBlock(); + } + else + { + insertionPoint = m_comp->m_domTree->Intersect(insertionPoint, backEdge->getSourceBlock()); + } + } + + while ((insertionPoint != nullptr) && m_loop->ContainsBlock(insertionPoint) && + m_loop->MayExecuteBlockMultipleTimesPerIteration(insertionPoint)) + { + insertionPoint = insertionPoint->bbIDom; + } + + if ((insertionPoint == nullptr) || !m_loop->ContainsBlock(insertionPoint)) + { + return nullptr; + } + + for (int i = 0; i < cursors->Height(); i++) + { + CursorInfo& cursor = cursors->BottomRef(i); + + if (insertionPoint == cursor.Block) + { + if (insertionPoint->HasTerminator() && (cursor.Stmt == insertionPoint->lastStmt())) + { + return nullptr; + } + } + else + { + if (m_comp->optReachable(cursor.Block, m_loop->GetHeader(), insertionPoint)) + { + // Header is reachable without going through the insertion + // point, meaning that the insertion point does not + // post-dominate the use of an IV we want to replace. + // + // TODO-CQ: We only need to check whether the header is + // reachable from inside the loop, which is both cheaper and + // less conservative to check. + // + return nullptr; + } + } + } + + return insertionPoint; +} + //------------------------------------------------------------------------ // optInductionVariables: Try and optimize induction variables in the method. // @@ -1200,6 +1924,13 @@ PhaseStatus Compiler::optInductionVariables() continue; } + StrengthReductionContext strengthReductionContext(this, scevContext, loop, loopLocals); + if (strengthReductionContext.TryStrengthReduce()) + { + Metrics.LoopsStrengthReduced++; + changed = true; + } + if (optMakeLoopDownwardsCounted(scevContext, loop, &loopLocals)) { Metrics.LoopsMadeDownwardsCounted++; @@ -1212,6 +1943,8 @@ PhaseStatus Compiler::optInductionVariables() #if defined(TARGET_XARCH) && defined(TARGET_64BIT) int numWidened = 0; + JITDUMP("Considering primary IVs of " FMT_LP " for widening\n", loop->GetIndex()); + for (Statement* stmt : loop->GetHeader()->Statements()) { if (!stmt->IsPhiDefnStmt()) diff --git a/src/coreclr/jit/jitconfigvalues.h b/src/coreclr/jit/jitconfigvalues.h index c83fdb48fb326..50824303d2f78 100644 --- a/src/coreclr/jit/jitconfigvalues.h +++ b/src/coreclr/jit/jitconfigvalues.h @@ -772,6 +772,9 @@ RELEASE_CONFIG_INTEGER(JitEnableCrossBlockLocalAssertionProp, W("JitEnableCrossB // Do greedy RPO-based layout in Compiler::fgReorderBlocks. RELEASE_CONFIG_INTEGER(JitDoReversePostOrderLayout, W("JitDoReversePostOrderLayout"), 1); +// Enable strength reduction +RELEASE_CONFIG_INTEGER(JitEnableStrengthReduction, W("JitEnableStrengthReduction"), 0) + // JitFunctionFile: Name of a file that contains a list of functions. 
If the currently compiled function is in the // file, certain other JIT config variables will be active. If the currently compiled function is not in the file, // the specific JIT config variables will not be active. diff --git a/src/coreclr/jit/jitmetadatalist.h b/src/coreclr/jit/jitmetadatalist.h index 10da7249177f8..4aad9abd1b3cc 100644 --- a/src/coreclr/jit/jitmetadatalist.h +++ b/src/coreclr/jit/jitmetadatalist.h @@ -36,6 +36,7 @@ JITMETADATAMETRIC(LoopsAligned, int, 0) JITMETADATAMETRIC(LoopsIVWidened, int, 0) JITMETADATAMETRIC(WidenedIVs, int, 0) JITMETADATAMETRIC(LoopsMadeDownwardsCounted, int, 0) +JITMETADATAMETRIC(LoopsStrengthReduced, int, 0) JITMETADATAMETRIC(VarsInSsa, int, 0) JITMETADATAMETRIC(HoistedExpressions, int, 0) JITMETADATAMETRIC(RedundantBranchesEliminated, int, JIT_METADATA_HIGHER_IS_BETTER) diff --git a/src/coreclr/jit/scev.cpp b/src/coreclr/jit/scev.cpp index e1c65aa11fbc2..9053d47bbf55d 100644 --- a/src/coreclr/jit/scev.cpp +++ b/src/coreclr/jit/scev.cpp @@ -220,6 +220,61 @@ bool Scev::IsInvariant() return result != ScevVisit::Abort; } +//------------------------------------------------------------------------ +// Scev::Equals: Check if two SCEV trees are equal. +// +// Parameters: +// left - First scev +// right - Second scev +// +// Returns: +// True if they represent the same value; otherwise false. +// +bool Scev::Equals(Scev* left, Scev* right) +{ + if (left == right) + { + return true; + } + + if ((left->Oper != right->Oper) || (left->Type != right->Type)) + { + return false; + } + + switch (left->Oper) + { + case ScevOper::Constant: + return static_cast(left)->Value == static_cast(right)->Value; + case ScevOper::Local: + { + ScevLocal* leftLocal = static_cast(left); + ScevLocal* rightLocal = static_cast(right); + return (leftLocal->LclNum == rightLocal->LclNum) && (leftLocal->SsaNum == rightLocal->SsaNum); + } + case ScevOper::ZeroExtend: + case ScevOper::SignExtend: + return Scev::Equals(static_cast(left)->Op1, static_cast(right)->Op1); + case ScevOper::Add: + case ScevOper::Mul: + case ScevOper::Lsh: + { + ScevBinop* leftBinop = static_cast(left); + ScevBinop* rightBinop = static_cast(right); + return Scev::Equals(leftBinop->Op1, rightBinop->Op1) && Scev::Equals(leftBinop->Op2, rightBinop->Op2); + } + case ScevOper::AddRec: + { + ScevAddRec* leftAddRec = static_cast(left); + ScevAddRec* rightAddRec = static_cast(right); + return Scev::Equals(leftAddRec->Start, rightAddRec->Start) && + Scev::Equals(leftAddRec->Step, rightAddRec->Step); + } + default: + unreached(); + } +} + //------------------------------------------------------------------------ // ScalarEvolutionContext: Construct an instance of a context to do scalar evolution in. 
// @@ -312,7 +367,26 @@ ScevUnop* ScalarEvolutionContext::NewExtension(ScevOper oper, var_types targetTy ScevBinop* ScalarEvolutionContext::NewBinop(ScevOper oper, Scev* op1, Scev* op2) { assert((op1 != nullptr) && (op2 != nullptr)); - ScevBinop* binop = new (m_comp, CMK_LoopIVOpts) ScevBinop(oper, op1->Type, op1, op2); + var_types resultType = op1->Type; + if (oper == ScevOper::Add) + { + if (varTypeIsGC(op1->Type)) + { + assert(op2->Type == TYP_I_IMPL); + resultType = TYP_BYREF; + } + else if (varTypeIsGC(op2->Type)) + { + assert(op1->Type == TYP_I_IMPL); + resultType = TYP_BYREF; + } + else + { + assert(op1->Type == op2->Type); + } + } + + ScevBinop* binop = new (m_comp, CMK_LoopIVOpts) ScevBinop(oper, resultType, op1, op2); return binop; } @@ -418,6 +492,12 @@ Scev* ScalarEvolutionContext::AnalyzeNew(BasicBlock* block, GenTree* tree, int d LclVarDsc* dsc = m_comp->lvaGetDesc(tree->AsLclVarCommon()); LclSsaVarDsc* ssaDsc = dsc->GetPerSsaData(tree->AsLclVarCommon()->GetSsaNum()); + if ((tree->TypeGet() != dsc->TypeGet()) || varTypeIsSmall(tree)) + { + // TODO: Truncations (for TYP_INT uses of TYP_LONG locals) and NOL handling? + return nullptr; + } + if ((ssaDsc->GetBlock() == nullptr) || !m_loop->ContainsBlock(ssaDsc->GetBlock())) { return NewLocal(tree->AsLclVarCommon()->GetLclNum(), tree->AsLclVarCommon()->GetSsaNum()); @@ -602,6 +682,15 @@ Scev* ScalarEvolutionContext::AnalyzeNew(BasicBlock* block, GenTree* tree, int d oper = ScevOper::Add; break; case GT_SUB: + if (varTypeIsGC(op2->Type)) + { + // We represent x - y as x + (-1)*y, which does not + // work if y is a GC type. If we wanted to support this + // we would need to add an explicit ScevOper::Sub + // operator. + return nullptr; + } + oper = ScevOper::Add; op2 = NewBinop(ScevOper::Mul, op2, NewConstant(op2->Type, -1)); break; @@ -909,11 +998,14 @@ static T FoldBinop(ScevOper oper, T op1, T op2) } } +const SimplificationAssumptions ScalarEvolutionContext::NoAssumptions; + //------------------------------------------------------------------------ // Simplify: Try to simplify a SCEV node by folding and canonicalization. // // Parameters: -// scev - The node +// scev - The node +// assumptions - Assumptions that the simplification procedure can use. // // Returns: // Simplified node. @@ -925,7 +1017,7 @@ static T FoldBinop(ScevOper oper, T op1, T op2) // Simple unops/binops on constants are folded. Operands are distributed into // add recs whenever possible. // -Scev* ScalarEvolutionContext::Simplify(Scev* scev) +Scev* ScalarEvolutionContext::Simplify(Scev* scev, const SimplificationAssumptions& assumptions) { switch (scev->Oper) { @@ -950,7 +1042,7 @@ Scev* ScalarEvolutionContext::Simplify(Scev* scev) ScevUnop* unop = (ScevUnop*)scev; assert(genTypeSize(unop->Type) >= genTypeSize(unop->Op1->Type)); - Scev* op1 = Simplify(unop->Op1); + Scev* op1 = Simplify(unop->Op1, assumptions); if (unop->Type == op1->Type) { @@ -966,6 +1058,27 @@ Scev* ScalarEvolutionContext::Simplify(Scev* scev) : (int64_t)(int32_t)cns->Value); } + if (op1->OperIs(ScevOper::AddRec)) + { + ScevAddRec* addRec = (ScevAddRec*)op1; + + // We need to guarantee that + // ext() = to distribute the extension. + // + // Equivalently this is the case iff + // forall i < backedgeTakenCount, ext(start + step * i) == ext(start) + ext(step) * i. + // + // For zext: we must guarantee that 0 <= start + step * i < 2^32. + // For sext: we must guarantee that -2^31 <= start + step * i < 2^31. 
+ // + if (!AddRecMayOverflow(addRec, unop->OperIs(ScevOper::SignExtend), assumptions)) + { + Scev* newStart = Simplify(NewExtension(unop->Oper, TYP_LONG, addRec->Start), assumptions); + Scev* newStep = Simplify(NewExtension(unop->Oper, TYP_LONG, addRec->Step), assumptions); + return NewAddRec(newStart, newStep); + } + } + return (op1 == unop->Op1) ? unop : NewExtension(unop->Oper, unop->Type, op1); } case ScevOper::Add: @@ -973,8 +1086,8 @@ Scev* ScalarEvolutionContext::Simplify(Scev* scev) case ScevOper::Lsh: { ScevBinop* binop = (ScevBinop*)scev; - Scev* op1 = Simplify(binop->Op1); - Scev* op2 = Simplify(binop->Op2); + Scev* op1 = Simplify(binop->Op1, assumptions); + Scev* op2 = Simplify(binop->Op2, assumptions); if (binop->OperIs(ScevOper::Add, ScevOper::Mul)) { @@ -995,9 +1108,9 @@ Scev* ScalarEvolutionContext::Simplify(Scev* scev) // + x => // * x => ScevAddRec* addRec = (ScevAddRec*)op1; - Scev* newStart = Simplify(NewBinop(binop->Oper, addRec->Start, op2)); + Scev* newStart = Simplify(NewBinop(binop->Oper, addRec->Start, op2), assumptions); Scev* newStep = scev->OperIs(ScevOper::Mul, ScevOper::Lsh) - ? Simplify(NewBinop(binop->Oper, addRec->Step, op2)) + ? Simplify(NewBinop(binop->Oper, addRec->Step, op2), assumptions) : addRec->Step; return NewAddRec(newStart, newStep); } @@ -1037,7 +1150,7 @@ Scev* ScalarEvolutionContext::Simplify(Scev* scev) { ScevBinop* newOp2 = NewBinop(ScevOper::Add, ((ScevBinop*)op1)->Op2, cns2); ScevBinop* newAdd = NewBinop(ScevOper::Add, ((ScevBinop*)op1)->Op1, newOp2); - return Simplify(newAdd); + return Simplify(newAdd, assumptions); } } @@ -1060,7 +1173,7 @@ Scev* ScalarEvolutionContext::Simplify(Scev* scev) { ScevBinop* newOp2 = NewBinop(ScevOper::Mul, ((ScevBinop*)op1)->Op2, cns2); ScevBinop* newMul = NewBinop(ScevOper::Mul, ((ScevBinop*)op1)->Op1, newOp2); - return Simplify(newMul); + return Simplify(newMul, assumptions); } } } @@ -1082,7 +1195,7 @@ Scev* ScalarEvolutionContext::Simplify(Scev* scev) ScevBinop* newOp1 = NewBinop(ScevOper::Add, ((ScevBinop*)op1)->Op1, ((ScevBinop*)op2)->Op1); ScevBinop* newOp2 = NewBinop(ScevOper::Add, ((ScevBinop*)op1)->Op2, ((ScevBinop*)op2)->Op2); ScevBinop* newAdd = NewBinop(ScevOper::Add, newOp1, newOp2); - return Simplify(newAdd); + return Simplify(newAdd, assumptions); } } @@ -1091,8 +1204,8 @@ Scev* ScalarEvolutionContext::Simplify(Scev* scev) case ScevOper::AddRec: { ScevAddRec* addRec = (ScevAddRec*)scev; - Scev* start = Simplify(addRec->Start); - Scev* step = Simplify(addRec->Step); + Scev* start = Simplify(addRec->Start, assumptions); + Scev* step = Simplify(addRec->Step, assumptions); return (start == addRec->Start) && (step == addRec->Step) ? 
addRec : NewAddRec(start, step); } default: @@ -1119,7 +1232,31 @@ bool ScalarEvolutionContext::Materialize(Scev* scev, bool createIR, GenTree** re case ScevOper::Constant: { ScevConstant* cns = (ScevConstant*)scev; - *resultVN = m_comp->vnStore->VNForGenericCon(scev->Type, reinterpret_cast(&cns->Value)); + if (cns->TypeIs(TYP_REF)) + { + if (cns->Value != 0) + { + // TODO-CQ: Proper handling for handles + return false; + } + + *resultVN = m_comp->vnStore->VNForNull(); + } + else if (cns->TypeIs(TYP_BYREF)) + { + if (cns->Value != 0) + { + // TODO-CQ: Proper handling for handles + return false; + } + + *resultVN = m_comp->vnStore->VNForNull(); + } + else + { + *resultVN = m_comp->vnStore->VNForGenericCon(scev->Type, reinterpret_cast(&cns->Value)); + } + if (createIR) { if (scev->TypeIs(TYP_LONG)) @@ -1485,6 +1622,77 @@ bool ScalarEvolutionContext::MayOverflowBeforeExit(ScevAddRec* lhs, Scev* rhs, V return result != RelopEvaluationResult::True; } +//------------------------------------------------------------------------ +// AddRecMayOverflow: +// Check if an add recurrence may overflow inside the containing loop. +// +// Parameters: +// addRec - The add recurrence +// signedBound - Whether to check using signed (true) or unsigned (false) bounds. +// assumptions - Assumptions about the containing loop. +// +// Returns: +// True if the add recurrence may overflow and wrap around. False if we were +// able to prove that it cannot. +// +// Remarks: +// May return true conservatively. +// +bool ScalarEvolutionContext::AddRecMayOverflow(ScevAddRec* addRec, + bool signedBound, + const SimplificationAssumptions& assumptions) +{ + if (assumptions.NumBackEdgeTakenBound == 0) + { + return true; + } + + if (!addRec->TypeIs(TYP_INT)) + { + return true; + } + + // In general we are interested in proving that the add recurrence does not + // cross the minimum or maximum bounds during the iteration of the loop: + // + // For signed bounds (sext): sext(a + b) != sext(a) + sext(b) if a + b crosses -2^31 or 2^31 - 1. + // For unsigned bounds (zext): zext(a + b) != zext(a) + zext(b) if a + b crosses 0 or 2^32 - 1. + // + // We need to verify this condition for all i < bound where a = start, b = + // step + i. + // + // For now, we only handle the super duper simple case of unsigned bounds + // with addRec = and a TYP_INT bound. + // + if (signedBound) + { + return true; + } + + int64_t startCns; + if (addRec->Start->GetConstantValue(m_comp, &startCns) && (startCns != 0)) + { + return true; + } + + int64_t stepCns; + if (!addRec->Step->GetConstantValue(m_comp, &stepCns) || (stepCns != 1)) + { + return true; + } + + for (unsigned i = 0; i < assumptions.NumBackEdgeTakenBound; i++) + { + Scev* bound = assumptions.BackEdgeTakenBound[i]; + if (bound->TypeIs(TYP_INT)) + { + return false; + } + } + + return true; +} + //------------------------------------------------------------------------ // MapRelopToVNFunc: // Given a potentially unsigned IR relop, map it to a VNFunc. 
diff --git a/src/coreclr/jit/scev.h b/src/coreclr/jit/scev.h index cc4bcb6cfb4ac..e965137b90f1a 100644 --- a/src/coreclr/jit/scev.h +++ b/src/coreclr/jit/scev.h @@ -74,6 +74,8 @@ struct Scev ScevVisit Visit(TVisitor visitor); bool IsInvariant(); + + static bool Equals(Scev* left, Scev* right); }; struct ScevConstant : Scev @@ -201,6 +203,14 @@ enum class RelopEvaluationResult typedef JitHashTable, Scev*> ScalarEvolutionMap; +struct SimplificationAssumptions +{ + // A bound on the number of times a backedge will be taken; the backedge + // taken count is <= min(BackEdgeTakenBound). + Scev** BackEdgeTakenBound = nullptr; + unsigned NumBackEdgeTakenBound = 0; +}; + // Scalar evolution is analyzed in the context of a single loop, and are // computed on-demand by the use of the "Analyze" method on this class, which // also maintains a cache. @@ -230,8 +240,10 @@ class ScalarEvolutionContext VNFunc MapRelopToVNFunc(genTreeOps oper, bool isUnsigned); RelopEvaluationResult EvaluateRelop(ValueNum relop); bool MayOverflowBeforeExit(ScevAddRec* lhs, Scev* rhs, VNFunc exitOp); + bool AddRecMayOverflow(ScevAddRec* addRec, bool signedBound, const SimplificationAssumptions& assumptions); bool Materialize(Scev* scev, bool createIR, GenTree** result, ValueNum* resultVN); + public: ScalarEvolutionContext(Compiler* comp); @@ -244,7 +256,9 @@ class ScalarEvolutionContext ScevAddRec* NewAddRec(Scev* start, Scev* step); Scev* Analyze(BasicBlock* block, GenTree* tree); - Scev* Simplify(Scev* scev); + + static const SimplificationAssumptions NoAssumptions; + Scev* Simplify(Scev* scev, const SimplificationAssumptions& assumptions = NoAssumptions); Scev* ComputeExitNotTakenCount(BasicBlock* exiting);