Skip to content

[LV] Fix crash in epilog vectorization#185480

Open
annamthomas wants to merge 1 commit intollvm:mainfrom
annamthomas:179407_fix
Open

[LV] Fix crash in epilog vectorization#185480
annamthomas wants to merge 1 commit intollvm:mainfrom
annamthomas:179407_fix

Conversation

@annamthomas
Copy link
Copy Markdown
Contributor

Fixes #179407.

When vectorizing epilogues, fixScalarResumeValuesFromBypass and fixReductionScalarResumeWhenVectorizingEpilog assumed that induction PHI incoming values from the preheader are always PHINodes.

Modified version of #179459.

Fixes llvm#179407.

When vectorizing epilogues, fixScalarResumeValuesFromBypass and
fixReductionScalarResumeWhenVectorizingEpilog assumed that induction
PHI incoming values from the preheader are always PHINodes.

Modified version of llvm#179459.
@llvmbot
Copy link
Copy Markdown
Member

llvmbot commented Mar 9, 2026

@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: None (annamthomas)

Changes

Fixes #179407.

When vectorizing epilogues, fixScalarResumeValuesFromBypass and fixReductionScalarResumeWhenVectorizingEpilog assumed that induction PHI incoming values from the preheader are always PHINodes.

Modified version of #179459.


Full diff: https://github.com/llvm/llvm-project/pull/185480.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+16-4)
  • (added) llvm/test/Transforms/LoopVectorize/pr179407.ll (+110)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index bb4eef5a41c09..95580b38d7337 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7373,8 +7373,10 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
           VPlanPatternMatch::m_Select(VPlanPatternMatch::m_VPValue(),
                                       VPlanPatternMatch::m_VPValue(BackedgeVal),
                                       VPlanPatternMatch::m_VPValue()));
-    EpiRedHeaderPhi = cast<VPReductionPHIRecipe>(
+    EpiRedHeaderPhi = cast_if_present<VPReductionPHIRecipe>(
         vputils::findRecipe(BackedgeVal, IsaPred<VPReductionPHIRecipe>));
+    if (!EpiRedHeaderPhi)
+      return;
   }
 
   Value *MainResumeValue;
@@ -9345,12 +9347,22 @@ static void fixScalarResumeValuesFromBypass(BasicBlock *BypassBlock, Loop *L,
   // Fix induction resume values from the additional bypass block.
   IRBuilder<> BypassBuilder(BypassBlock, BypassBlock->getFirstInsertionPt());
   for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
-    auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
+    Value *IncomingFromPH = IVPhi->getIncomingValueForBlock(PH);
     Value *V = createInductionAdditionalBypassValues(
         IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount,
         LVL.getPrimaryInduction());
-    // TODO: Directly add as extra operand to the VPResumePHI recipe.
-    Inc->setIncomingValueForBlock(BypassBlock, V);
+
+    if (auto *Inc = dyn_cast<PHINode>(IncomingFromPH)) {
+      // TODO: Directly add as extra operand to the VPResumePHI recipe.
+      Inc->setIncomingValueForBlock(BypassBlock, V);
+    } else {
+      // If the incoming value from preheader is not a PHI node (e.g., a
+      // constant in functions with GC statepoint), directly add an incoming
+      // value from BypassBlock to IVPhi. The incoming value from PH remains
+      // unchanged (IncomingFromPH).
+      if (IVPhi->getBasicBlockIndex(BypassBlock) == -1)
+        IVPhi->addIncoming(V, BypassBlock);
+    }
   }
 }
 
diff --git a/llvm/test/Transforms/LoopVectorize/pr179407.ll b/llvm/test/Transforms/LoopVectorize/pr179407.ll
new file mode 100644
index 0000000000000..54b767d5ff3b4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/pr179407.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization -force-vector-width=2 -epilogue-vectorization-force-VF=2 -S < %s | FileCheck %s
+;
+; Test case for issue #179407: LoopVectorize should not crash during
+; epilogue vectorization when the preheader has been simplified.
+
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @wombat(i64 %arg) {
+; CHECK-LABEL: define void @wombat(
+; CHECK-SAME: i64 [[ARG:%.*]]) {
+; CHECK-NEXT:  [[ITER_CHECK:.*]]:
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[ARG]], 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 2
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
+; CHECK:       [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
+; CHECK-NEXT:    [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BIN_RDX5:%.*]] = phi <2 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[BIN_RDX5]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI4:%.*]] = phi <2 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[VEC_PHI4]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = or <2 x i32> [[VEC_PHI4]], [[BIN_RDX5]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[BIN_RDX6]])
+; CHECK-NEXT:    [[TMP3:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze i1 [[TMP3]]
+; CHECK-NEXT:    [[RDX_SELECT:%.*]] = select i1 [[TMP4]], i32 0, i32 0
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label %[[BB5:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
+; CHECK:       [[VEC_EPILOG_ITER_CHECK]]:
+; CHECK-NEXT:    [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 2
+; CHECK-NEXT:    br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]]
+; CHECK:       [[VEC_EPILOG_PH]]:
+; CHECK-NEXT:    [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP2]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
+; CHECK-NEXT:    [[N_MOD_VF7:%.*]] = urem i64 [[TMP0]], 2
+; CHECK-NEXT:    [[N_VEC8:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x i32> zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0
+; CHECK-NEXT:    br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
+; CHECK:       [[VEC_EPILOG_VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI10:%.*]] = phi <2 x i32> [ [[TMP5]], %[[VEC_EPILOG_PH]] ], [ [[VEC_PHI10]], %[[VEC_EPILOG_VECTOR_BODY]] ]
+; CHECK-NEXT:    [[INDEX_NEXT11]] = add nuw i64 [[INDEX9]], 2
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC8]]
+; CHECK-NEXT:    br i1 [[TMP6]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK:       [[VEC_EPILOG_MIDDLE_BLOCK]]:
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> [[VEC_PHI10]])
+; CHECK-NEXT:    [[TMP8:%.*]] = call i1 @llvm.vector.reduce.or.v2i1(<2 x i1> splat (i1 true))
+; CHECK-NEXT:    [[TMP9:%.*]] = freeze i1 [[TMP8]]
+; CHECK-NEXT:    [[RDX_SELECT12:%.*]] = select i1 [[TMP9]], i32 0, i32 0
+; CHECK-NEXT:    [[CMP_N13:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]]
+; CHECK-NEXT:    br i1 [[CMP_N13]], label %[[BB5]], label %[[VEC_EPILOG_SCALAR_PH]]
+; CHECK:       [[VEC_EPILOG_SCALAR_PH]]:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX10:%.*]] = phi i32 [ [[TMP7]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP2]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX11:%.*]] = phi i32 [ [[RDX_SELECT12]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
+; CHECK-NEXT:    br label %[[BB1:.*]]
+; CHECK:       [[BB1]]:
+; CHECK-NEXT:    [[PHI:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[PHI2:%.*]] = phi i32 [ [[BC_MERGE_RDX10]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[OR:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[PHI3:%.*]] = phi i32 [ [[BC_MERGE_RDX11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SELECT:%.*]], %[[BB1]] ]
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp eq i32 0, 0
+; CHECK-NEXT:    [[SELECT]] = select i1 [[ICMP]], i32 0, i32 [[PHI3]]
+; CHECK-NEXT:    [[OR]] = or i32 [[PHI2]], 0
+; CHECK-NEXT:    [[ADD]] = add i64 [[PHI]], 1
+; CHECK-NEXT:    [[ICMP4:%.*]] = icmp ult i64 [[PHI]], [[ARG]]
+; CHECK-NEXT:    br i1 [[ICMP4]], label %[[BB1]], label %[[BB5]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK:       [[BB5]]:
+; CHECK-NEXT:    [[PHI6:%.*]] = phi i32 [ [[SELECT]], %[[BB1]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ [[RDX_SELECT12]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[PHI7:%.*]] = phi i32 [ [[OR]], %[[BB1]] ], [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[TMP7]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret void
+;
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %phi = phi i64 [ 0, %bb ], [ %add, %bb1 ]
+  %phi2 = phi i32 [ 0, %bb ], [ %or, %bb1 ]
+  %phi3 = phi i32 [ 0, %bb ], [ %select, %bb1 ]
+  %icmp = icmp eq i32 0, 0
+  %select = select i1 %icmp, i32 0, i32 %phi3
+  %or = or i32 %phi2, 0
+  %add = add i64 %phi, 1
+  %icmp4 = icmp ult i64 %phi, %arg
+  br i1 %icmp4, label %bb1, label %bb5
+
+bb5:                                              ; preds = %bb1
+  %phi6 = phi i32 [ %select, %bb1 ]
+  %phi7 = phi i32 [ %or, %bb1 ]
+  ret void
+}
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[PROF3]] = !{!"branch_weights", i32 2, i32 2}
+; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.

@annamthomas
Copy link
Copy Markdown
Contributor Author

@fhahn, @david-arm Could I please get a review for this? We're having crashes downstream and are blocked.

Copy link
Copy Markdown
Contributor

@fhahn fhahn left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@annamthomas it looks like the patch needs a rebase.



target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
target triple = "x86_64-unknown-linux-gnu"
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it this depends on the target, it needs to go into the X86 sub-folder

; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
; CHECK: [[VECTOR_BODY]]:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; CHECK-NEXT: [[BIN_RDX5:%.*]] = phi <2 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[BIN_RDX5]], %[[VECTOR_BODY]] ]
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the underlying issue is that we fail to simplify this to the start value?

Comment on lines +7378 to +7379
if (!EpiRedHeaderPhi)
return;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure this early exit is right, we found a reduction for which we need to update the resume value, which we don't do on this path

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you're right. I'll rebase and update the fix.
We need to find the resume value here. For this test case, 0 was fine because it was a trivial reduction.

Inc->setIncomingValueForBlock(BypassBlock, V);
} else {
// If the incoming value from preheader is not a PHI node (e.g., a
// constant in functions with GC statepoint), directly add an incoming
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the GC state point bit is relevant?

@fhahn
Copy link
Copy Markdown
Contributor

fhahn commented Mar 11, 2026

FWIW I am currently looking on re-working how we handle resume values for the scalar loop and potentially the epilogue vector loop to make this more robust, removing the need to find the resume value like we do currently. Hopefully I'll be able to share something working in the next few days.

For this particular case, it might be possible to avoid the issue by removing the trivial phi

@annamthomas
Copy link
Copy Markdown
Contributor Author

Thanks @fhahn!

I followed two different approaches:

  1. Find the “resume” value for the epilog vectorization. This “works” on couple of reduced test cases I tried. AFAICT, This isn’t the recommended approach.
  2. Simplify the “self referencing phi” created in the main body loop. IIUC, by cleaning this up, we will not even hit it in the epilog vectorization (so we don’t need to find the “resume” values). This works on the reduced test case (but we don't even seem to be vectorizing the main loop anymore). This still fails in the original IR.

@annamthomas
Copy link
Copy Markdown
Contributor Author

This is the code in simplifyRecipe in VPlanTransforms.cpp:

+  if (auto *RedPHI = dyn_cast<VPReductionPHIRecipe>(Def)) {
+    if (RedPHI->getNumOperands() == 2) {
+      VPValue *StartValue = RedPHI->getOperand(0);
+      VPValue *BackedgeValue = RedPHI->getOperand(1);
+
+      // Check if the backedge value is the reduction PHI itself (direct self-reference)
+      if (BackedgeValue == RedPHI) {
+        RedPHI->replaceAllUsesWith(StartValue);
+        return;
+      }
+    }
+  }

It removes the self-referencing phi generated by LV in this reduced test case:

define void @"test2"(i64 %0){
entry:
  %length.i381 = load atomic i32, ptr addrspace(1) null unordered, align 4
  br label %bci_456

bci_456:                                          ; preds = %bci_456, %entry
  %indvars.iv3587 = phi i64 [ 0, %entry ], [ %indvars.iv.next3588, %bci_456 ]
  %1 = phi i32 [ 0, %entry ], [ %.neg4554, %bci_456 ]
  %"$vphi.9" = phi i32 [ 0, %entry ], [ %"$vphi.8", %bci_456 ]
  %.not = icmp eq i32 %"$vphi.9", 0
  %"$vphi.8" = select i1 %.not, i32 32, i32 %"$vphi.9"
  %.neg4554 = or i32 %1, 0
  %indvars.iv.next3588 = add i64 %indvars.iv3587, 1
  %.not3460 = icmp ult i64 %indvars.iv3587, %0
  br i1 %.not3460, label %bci_456, label %main.exit.selector

main.exit.selector:                               ; preds = %bci_456
  %"$vphi.8.lcssa" = phi i32 [ %"$vphi.8", %bci_456 ]
  %.lcssa4716 = phi i32 [ %.neg4554, %bci_456 ]
  ret void
}

@fhahn
Copy link
Copy Markdown
Contributor

fhahn commented Mar 11, 2026

@annamthomas could you give #185969 a try? I think it fixes the crash for the minimal reproducer, but given that it contains trivially simplify-able phis (really non-reductions), it would be good to have a bette reproducer that cannot be fixed by simplifying the phi

@annamthomas
Copy link
Copy Markdown
Contributor Author

@annamthomas could you give #185969 a try? I think it fixes the crash for the minimal reproducer, but given that it contains trivially simplify-able phis (really non-reductions), it would be good to have a bette reproducer that cannot be fixed by simplifying the phi

Thanks @fhahn ! I tried it and it fixes two of the reduced reproducers I had. However, to try on the original IR, I'll need to cherry-pick quite a few commits along with the fix (we're a bit behind on our daily merges). I'll try this in our original IR when it catches up. I'll report if it still crashes.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

Assertion `Val && "isa<> used on a null pointer"' failed.

3 participants