From 91ce52e277f97bea4ec336e142e2126446a22d3c Mon Sep 17 00:00:00 2001 From: Alireza Torabian Date: Fri, 30 May 2025 15:34:06 -0400 Subject: [PATCH] [LoopFusion] Detecting loop-carried dependencies using DA info The loop fusion pass will use the information provided by DA to detect loop-carried dependencies and fuse the loops if it is legal. --- llvm/lib/Transforms/Scalar/LoopFuse.cpp | 29 +++ .../LoopFusion/backward_loop_carried.ll | 185 ++++++++++++++++++ llvm/test/Transforms/LoopFusion/simple.ll | 28 ++- 3 files changed, 226 insertions(+), 16 deletions(-) create mode 100644 llvm/test/Transforms/LoopFusion/backward_loop_carried.ll diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp index d6bd92d520e28..280a86ff2bf3c 100644 --- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -100,6 +100,8 @@ STATISTIC(OnlySecondCandidateIsGuarded, "The second candidate is guarded while the first one is not"); STATISTIC(NumHoistedInsts, "Number of hoisted preheader instructions."); STATISTIC(NumSunkInsts, "Number of hoisted preheader instructions."); +STATISTIC(NumDepSafeFused, "Number of fused loops with dependencies " + "proven safe based on the dependence direction"); enum FusionDependenceAnalysisChoice { FUSION_DEPENDENCE_ANALYSIS_SCEV, @@ -1349,6 +1351,33 @@ struct LoopFuser { << "\n"); } #endif + unsigned Levels = DepResult->getLevels(); + unsigned SeparateLevels = DepResult->getSeparateLevels(); + unsigned CurLoopLevel = FC0.L->getLoopDepth(); + + bool OuterEqDir = true; + for (unsigned II = 1; II <= std::min(CurLoopLevel - 1, Levels); ++II) { + unsigned Direction = DepResult->getDirection(II, II > Levels); + if (!(Direction & Dependence::DVEntry::EQ)) { + // Different accesses in the outer levels of CurLoopLevel + OuterEqDir = false; + break; + } + } + if (!OuterEqDir || CurLoopLevel > Levels + SeparateLevels) { + LLVM_DEBUG(dbgs() << "Safe to fuse with no dependency\n"); + NumDepSafeFused++; 
+ return true; + } + + assert(CurLoopLevel > Levels && "Fusion candidates are not separated"); + unsigned CurDir = DepResult->getDirection(CurLoopLevel, true); + if (!(CurDir & Dependence::DVEntry::GT)) { + LLVM_DEBUG(dbgs() << "Safe to fuse with backward loop-carried " + "dependency\n"); + NumDepSafeFused++; + return true; + } if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor()) LLVM_DEBUG( diff --git a/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll b/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll new file mode 100644 index 0000000000000..d9759f7840862 --- /dev/null +++ b/llvm/test/Transforms/LoopFusion/backward_loop_carried.ll @@ -0,0 +1,185 @@ +; RUN: opt -S -passes=loop-fusion -da-disable-delinearization-checks < %s | FileCheck %s + +; The two inner loops have no dependency and can be fused because their +; accesses differ at the outer loop levels. + +; C Code +; +;; for (long int i = 0; i < n; i++) { +;; for (long int j = 0; j < n; j++) { +;; for (long int k = 0; k < n; k++) { +;; A[i][j][k] = i; +;; } +;; for (long int k = 0; k < n; k++) { +;; temp = A[i + 3][j + 2][k + 1]; + +define void @backward_dep0(i64 %n, ptr %A) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i64 %n, 0 + br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26 + +; CHECK-LABEL: backward_dep +; CHECK-COUNT-1: for.body{{[0-9]+}}: +; CHECK-NOT: for.body{{[0-9]+}}: + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24 + %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ] + %cmp26 = icmp sgt i64 %n, 0 + br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24 + +for.cond4.preheader.preheader: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21 + %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, 
%for.cond4.preheader.preheader ] + %cmp51 = icmp sgt i64 %n, 0 + br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit + +for.body6.preheader: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6.preheader, %for.body6 + %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ] + %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02 + store i64 %i.011, ptr %arrayidx8, align 8 + %inc = add nsw i64 %k.02, 1 + %exitcond13 = icmp ne i64 %inc, %n + br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit + +for.cond10.loopexit.loopexit: ; preds = %for.body6 + br label %for.cond10.loopexit + +for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader + %cmp113 = icmp sgt i64 %n, 0 + br i1 %cmp113, label %for.body12.preheader, label %for.inc21 + +for.body12.preheader: ; preds = %for.cond10.loopexit + br label %for.body12 + +for.body12: ; preds = %for.body12.preheader, %for.body12 + %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ] + %add = add nsw i64 %k9.05, 1 + %add13 = add nsw i64 %j.07, 2 + %add14 = add nsw i64 %i.011, 3 + %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %add14, i64 %add13, i64 %add + %0 = load i64, ptr %arrayidx17, align 8 + %inc19 = add nsw i64 %k9.05, 1 + %exitcond = icmp ne i64 %inc19, %n + br i1 %exitcond, label %for.body12, label %for.inc21.loopexit + +for.inc21.loopexit: ; preds = %for.body12 + br label %for.inc21 + +for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit + %inc22 = add nsw i64 %j.07, 1 + %exitcond14 = icmp ne i64 %inc22, %n + br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit + +for.inc24.loopexit: ; preds = %for.inc21 + br label %for.inc24 + +for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader + %inc25 = add nsw i64 %i.011, 1 + %exitcond15 = icmp ne i64 %inc25, %n + br i1 %exitcond15, label 
%for.cond1.preheader, label %for.end26.loopexit + +for.end26.loopexit: ; preds = %for.inc24 + br label %for.end26 + +for.end26: ; preds = %for.end26.loopexit, %entry + ret void +} + +; The two inner loops have a backward loop-carried dependency, allowing them +; to be fused. + +; C Code +; +;; for (long int i = 0; i < n; i++) { +;; for (long int j = 0; j < n; j++) { +;; for (long int k = 0; k < n; k++) { +;; A[i][j][k] = i; +;; } +;; for (long int k = 0; k < n; k++) { +;; temp = A[i][j][k - 1]; + +define void @backward_dep1(i64 %n, ptr %A) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i64 %n, 0 + br i1 %cmp10, label %for.cond1.preheader.preheader, label %for.end26 + +; CHECK-LABEL: backward_dep +; CHECK-COUNT-1: for.body{{[0-9]+}}: +; CHECK-NOT: for.body{{[0-9]+}}: + +for.cond1.preheader.preheader: ; preds = %entry + br label %for.cond1.preheader + +for.cond1.preheader: ; preds = %for.cond1.preheader.preheader, %for.inc24 + %i.011 = phi i64 [ %inc25, %for.inc24 ], [ 0, %for.cond1.preheader.preheader ] + %cmp26 = icmp sgt i64 %n, 0 + br i1 %cmp26, label %for.cond4.preheader.preheader, label %for.inc24 + +for.cond4.preheader.preheader: ; preds = %for.cond1.preheader + br label %for.cond4.preheader + +for.cond4.preheader: ; preds = %for.cond4.preheader.preheader, %for.inc21 + %j.07 = phi i64 [ %inc22, %for.inc21 ], [ 0, %for.cond4.preheader.preheader ] + %cmp51 = icmp sgt i64 %n, 0 + br i1 %cmp51, label %for.body6.preheader, label %for.cond10.loopexit + +for.body6.preheader: ; preds = %for.cond4.preheader + br label %for.body6 + +for.body6: ; preds = %for.body6.preheader, %for.body6 + %k.02 = phi i64 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ] + %arrayidx8 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %k.02 + store i64 %i.011, ptr %arrayidx8, align 8 + %inc = add nsw i64 %k.02, 1 + %exitcond13 = icmp ne i64 %inc, %n + br i1 %exitcond13, label %for.body6, label %for.cond10.loopexit.loopexit + +for.cond10.loopexit.loopexit: 
; preds = %for.body6 + br label %for.cond10.loopexit + +for.cond10.loopexit: ; preds = %for.cond10.loopexit.loopexit, %for.cond4.preheader + %cmp113 = icmp sgt i64 %n, 0 + br i1 %cmp113, label %for.body12.preheader, label %for.inc21 + +for.body12.preheader: ; preds = %for.cond10.loopexit + br label %for.body12 + +for.body12: ; preds = %for.body12.preheader, %for.body12 + %k9.05 = phi i64 [ %inc19, %for.body12 ], [ 0, %for.body12.preheader ] + %add = add nsw i64 %k9.05, -1 + %arrayidx17 = getelementptr inbounds [100 x [100 x i64]], ptr %A, i64 %i.011, i64 %j.07, i64 %add + %0 = load i64, ptr %arrayidx17, align 8 + %inc19 = add nsw i64 %k9.05, 1 + %exitcond = icmp ne i64 %inc19, %n + br i1 %exitcond, label %for.body12, label %for.inc21.loopexit + +for.inc21.loopexit: ; preds = %for.body12 + br label %for.inc21 + +for.inc21: ; preds = %for.inc21.loopexit, %for.cond10.loopexit + %inc22 = add nsw i64 %j.07, 1 + %exitcond14 = icmp ne i64 %inc22, %n + br i1 %exitcond14, label %for.cond4.preheader, label %for.inc24.loopexit + +for.inc24.loopexit: ; preds = %for.inc21 + br label %for.inc24 + +for.inc24: ; preds = %for.inc24.loopexit, %for.cond1.preheader + %inc25 = add nsw i64 %i.011, 1 + %exitcond15 = icmp ne i64 %inc25, %n + br i1 %exitcond15, label %for.cond1.preheader, label %for.end26.loopexit + +for.end26.loopexit: ; preds = %for.inc24 + br label %for.end26 + +for.end26: ; preds = %for.end26.loopexit, %entry + ret void +} diff --git a/llvm/test/Transforms/LoopFusion/simple.ll b/llvm/test/Transforms/LoopFusion/simple.ll index d63890df14461..dfb3d13b56f04 100644 --- a/llvm/test/Transforms/LoopFusion/simple.ll +++ b/llvm/test/Transforms/LoopFusion/simple.ll @@ -300,40 +300,36 @@ bb23: ; preds = %bb17, %bb define void @forward_dep(ptr noalias %arg) { ; CHECK-LABEL: @forward_dep( -; CHECK-NEXT: bb: -; CHECK-NEXT: br label [[BB7:%.*]] +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: br label %[[BB7:.*]] ; CHECK: bb7: -; CHECK-NEXT: [[DOT013:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ 
[[TMP15:%.*]], [[BB14:%.*]] ] -; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], [[BB14]] ] +; CHECK-NEXT: [[DOT013:%.*]] = phi i32 [ 0, %[[BB]] ], [ [[TMP15:%.*]], %[[BB25:.*]] ] +; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, %[[BB]] ], [ [[INDVARS_IV_NEXT3:%.*]], %[[BB25]] ] +; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[BB25]] ], [ 0, %[[BB]] ] ; CHECK-NEXT: [[TMP:%.*]] = add nsw i32 [[DOT013]], -3 ; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[INDVARS_IV22]], 3 ; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[TMP8]] to i32 ; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i32 [[TMP]], [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV22]] to i32 ; CHECK-NEXT: [[TMP12:%.*]] = srem i32 [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG:%.*]], i64 [[INDVARS_IV22]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV22]] ; CHECK-NEXT: store i32 [[TMP12]], ptr [[TMP13]], align 4 -; CHECK-NEXT: br label [[BB14]] +; CHECK-NEXT: br label %[[BB14:.*]] ; CHECK: bb14: -; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1 -; CHECK-NEXT: [[TMP15]] = add nuw nsw i32 [[DOT013]], 1 -; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100 -; CHECK-NEXT: br i1 [[EXITCOND4]], label [[BB7]], label [[BB19_PREHEADER:%.*]] -; CHECK: bb19.preheader: -; CHECK-NEXT: br label [[BB19:%.*]] -; CHECK: bb19: -; CHECK-NEXT: [[INDVARS_IV1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[BB25:%.*]] ], [ 0, [[BB19_PREHEADER]] ] ; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[INDVARS_IV1]], -3 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[TMP20]] ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = mul nsw i32 [[TMP22]], 3 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[ARG]], i64 [[INDVARS_IV1]] ; CHECK-NEXT: store i32 [[TMP23]], ptr [[TMP24]], 
align 4 -; CHECK-NEXT: br label [[BB25]] +; CHECK-NEXT: br label %[[BB25]] ; CHECK: bb25: +; CHECK-NEXT: [[INDVARS_IV_NEXT3]] = add nuw nsw i64 [[INDVARS_IV22]], 1 +; CHECK-NEXT: [[TMP15]] = add nuw nsw i32 [[DOT013]], 1 +; CHECK-NEXT: [[EXITCOND4:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT3]], 100 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], 100 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[BB19]], label [[BB26:%.*]] +; CHECK-NEXT: br i1 [[EXITCOND]], label %[[BB7]], label %[[BB26:.*]] ; CHECK: bb26: ; CHECK-NEXT: ret void ;