JIT: Add reasoning about loop trip counts and optimize counted loops …

…into downwards counted loops (#102261) This builds out some initial reasoning about trip counts of loops and utilizes it to convert upwards counted loops into downwards counted loops when beneficial. The trip count of a loop is defined to be the number of times the header block is entered. When this value can be computed the loop is called counted. The computation here is symbolic and can reason in terms of variables, such as array or span lengths. Computing the trip count requires the JIT to reason about overflow and to prove various conditions related to the start and end values of the loop. For example, a loop `for (int i = 0; i <= n; i++)` only has a determinable trip count if we can prove that `n < int.MaxValue`. The implementation here utilizes the logic provided by RBO to prove these conditions. In many cases we aren't able to prove them and thus must give up, but this should be improvable in an incremental fashion to handle common cases. Converting a counted loop to a downwards counting loop is beneficial if the induction variable is not used for anything other than the loop test. In those cases our target platforms are able to combine the decrement with the exit test into a single instruction. More importantly this usually frees up a register inside the loop. This transformation does not have that many hits (as one can imagine, the IVs of loops are usually used for something else). However, once strength reduction is implemented we expect that this transformation will be significantly more important since strength reduction in many cases is going to remove all uses of an IV except the mutation and the loop test. The reasoning about trip counts is itself also needed by strength reduction, which uses it to prove no overflow in various cases. 
TP regressions are going to be pretty large for this change:

- This enables DFS tree/loop finding in the IV opts phase outside win-x64, which costs around 0.4% TP on its own
- This optimization furthermore requires us to build dominators, which comes with its own TP cost

Long term we could remove these costs if we could avoid changing control flow in assertion prop and move RBO to the end of the opts loop (letting all control flow changes happen there). But for now I think we just have to pay some of the costs to allow us to do these optimizations.

Example:

```csharp
private static int Foo(int[] arr, int start, int count)
{
    int sum = 0;
    for (int i = 0; i < count; i++)
    {
        sum += arr[start];
        start++;
    }

    return sum;
}
```

```diff
@@ -1,19 +1,17 @@
 G_M42127_IG02:  ;; offset=0x0004
        xor      eax, eax
-       xor      r10d, r10d
        test     r8d, r8d
        jle      SHORT G_M42127_IG05
-       ;; size=10 bbWeight=1 PerfScore 1.75
-G_M42127_IG03:  ;; offset=0x000E
-       mov      r9d, dword ptr [rcx+0x08]
+       ;; size=7 bbWeight=1 PerfScore 1.50
+G_M42127_IG03:  ;; offset=0x000B
+       mov      r10d, dword ptr [rcx+0x08]
        mov      edx, edx
        ;; size=6 bbWeight=0.25 PerfScore 0.56
-G_M42127_IG04:  ;; offset=0x0014
+G_M42127_IG04:  ;; offset=0x0011
        cmp      edx, dword ptr [rcx+0x08]
        jae      SHORT G_M42127_IG06
        add      eax, dword ptr [rcx+4*rdx+0x10]
        inc      edx
-       inc      r10d
-       cmp      r10d, r8d
-       jl       SHORT G_M42127_IG04
-       ;; size=19 bbWeight=4 PerfScore 35.00
+       dec      r8d
+       jne      SHORT G_M42127_IG04
+       ;; size=16 bbWeight=4 PerfScore 34.00
```

Fix #100915
dotnet · May 30, 2024 · 1ded19e · 1ded19e
1 parent 36a85b0
commit 1ded19e
Show file tree

Hide file tree

Showing 10 changed files with 1,215 additions and 67 deletions.
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
@@ -72,22 +72,21 @@ inline var_types genActualType(T value);
  *                  Forward declarations
  */
 
-struct InfoHdr;              // defined in GCInfo.h
-struct escapeMapping_t;      // defined in fgdiagnostic.cpp
-class emitter;               // defined in emit.h
-struct ShadowParamVarInfo;   // defined in GSChecks.cpp
-struct InitVarDscInfo;       // defined in registerargconvention.h
-class FgStack;               // defined in fgbasic.cpp
-class Instrumentor;          // defined in fgprofile.cpp
-class SpanningTreeVisitor;   // defined in fgprofile.cpp
-class CSE_DataFlow;          // defined in optcse.cpp
-struct CSEdsc;               // defined in optcse.h
-class CSE_HeuristicCommon;   // defined in optcse.h
-class OptBoolsDsc;           // defined in optimizer.cpp
-struct RelopImplicationInfo; // defined in redundantbranchopts.cpp
-struct JumpThreadInfo;       // defined in redundantbranchopts.cpp
-class ProfileSynthesis;      // defined in profilesynthesis.h
-class LoopLocalOccurrences;  // defined in inductionvariableopts.cpp
+struct InfoHdr;             // defined in GCInfo.h
+struct escapeMapping_t;     // defined in fgdiagnostic.cpp
+class emitter;              // defined in emit.h
+struct ShadowParamVarInfo;  // defined in GSChecks.cpp
+struct InitVarDscInfo;      // defined in registerargconvention.h
+class FgStack;              // defined in fgbasic.cpp
+class Instrumentor;         // defined in fgprofile.cpp
+class SpanningTreeVisitor;  // defined in fgprofile.cpp
+class CSE_DataFlow;         // defined in optcse.cpp
+struct CSEdsc;              // defined in optcse.h
+class CSE_HeuristicCommon;  // defined in optcse.h
+class OptBoolsDsc;          // defined in optimizer.cpp
+struct JumpThreadInfo;      // defined in redundantbranchopts.cpp
+class ProfileSynthesis;     // defined in profilesynthesis.h
+class LoopLocalOccurrences; // defined in inductionvariableopts.cpp
 #ifdef DEBUG
 struct IndentStack;
 #endif
@@ -2187,6 +2186,8 @@ class FlowGraphNaturalLoop
         return m_exitEdges[index];
     }
 
+    BasicBlock* GetPreheader() const;
+
     unsigned GetDepth() const;
 
     bool ContainsBlock(BasicBlock* block);
@@ -2495,6 +2496,30 @@ enum class NodeThreading
     LIR,       // Nodes are in LIR form (after rationalization)
 };
 
+//------------------------------------------------------------------------
+// RelopImplicationInfo
+//
+// Describes the information needed to check for, and to describe, an
+// inference between two relops (a dominating one and a dominated one).
+//
+struct RelopImplicationInfo
+{
+    // Dominating relop, whose value may be determined by control flow
+    ValueNum domCmpNormVN = ValueNumStore::NoVN;
+    // Dominated relop, whose value we would like to determine
+    ValueNum treeNormVN = ValueNumStore::NoVN;
+    // Relationship between the two relops, if any (initially VRK_Same)
+    ValueNumStore::VN_RELATION_KIND vnRelation = ValueNumStore::VN_RELATION_KIND::VRK_Same;
+    // Can we draw an inference? (initially false)
+    bool canInfer = false;
+    // If canInfer and dominating relop is true, can we infer value of dominated relop? (initially true)
+    bool canInferFromTrue = true;
+    // If canInfer and dominating relop is false, can we infer value of dominated relop? (initially true)
+    bool canInferFromFalse = true;
+    // Reverse the sense of the inference (initially false)
+    bool reverseSense = false;
+};
+
 /*
 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
 XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
@@ -7521,10 +7546,14 @@ class Compiler
 #endif
 
     PhaseStatus optInductionVariables();
-    bool        optWidenPrimaryIV(FlowGraphNaturalLoop* loop,
-                                  unsigned              lclNum,
-                                  ScevAddRec*           addRec,
-                                  LoopLocalOccurrences* loopLocals);
+
+    bool optMakeLoopDownwardsCounted(ScalarEvolutionContext& scevContext,
+                                     FlowGraphNaturalLoop*   loop,
+                                     LoopLocalOccurrences*   loopLocals);
+    bool optWidenPrimaryIV(FlowGraphNaturalLoop* loop,
+                           unsigned              lclNum,
+                           ScevAddRec*           addRec,
+                           LoopLocalOccurrences* loopLocals);
 
     bool optCanSinkWidenedIV(unsigned lclNum, FlowGraphNaturalLoop* loop);
     bool optIsIVWideningProfitable(unsigned              lclNum,
@@ -10201,6 +10230,7 @@ class Compiler
         STRESS_MODE(UNWIND) /* stress unwind info; e.g., create function fragments */           \
         STRESS_MODE(OPT_REPEAT) /* stress JitOptRepeat */                                       \
         STRESS_MODE(INITIAL_PARAM_REG) /* Stress initial register assigned to parameters */     \
+        STRESS_MODE(DOWNWARDS_COUNTED_LOOPS) /* Make more loops downwards counted         */    \
                                                                                                 \
         /* After COUNT_VARN, stress level 2 does all of these all the time */                   \
                                                                                                 \

diff --git a/src/coreclr/jit/flowgraph.cpp b/src/coreclr/jit/flowgraph.cpp
@@ -4141,6 +4141,28 @@ FlowGraphNaturalLoop::FlowGraphNaturalLoop(const FlowGraphDfsTree* dfsTree, Basi
 {
 }
 
+//------------------------------------------------------------------------
+// GetPreheader: Get the preheader of this loop, if it has one.
+//
+// Returns:
+//   The source block of the loop's only entry edge if that block is BBJ_ALWAYS; otherwise nullptr.
+//
+BasicBlock* FlowGraphNaturalLoop::GetPreheader() const
+{
+    if (m_entryEdges.size() != 1)
+    {
+        return nullptr;
+    }
+
+    BasicBlock* preheader = m_entryEdges[0]->getSourceBlock();
+    if (!preheader->KindIs(BBJ_ALWAYS))
+    {
+        return nullptr;
+    }
+
+    return preheader;
+}
+
 //------------------------------------------------------------------------
 // GetDepth: Get the depth of the loop.
 //