From f81929e548a1810d4beb29676f660cf582dcc300 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 12 Aug 2025 15:33:04 +0000 Subject: [PATCH 1/2] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20in?= =?UTF-8?q?itial=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 45 +++++++---- .../SLPVectorizer/X86/no_alternate_divrem.ll | 27 ++++--- .../X86/vect_copyable_in_binops.ll | 77 ++++--------------- 3 files changed, 58 insertions(+), 91 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3045eeb3eb48e..b605cddc9258b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -25457,7 +25457,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI, template static bool tryToVectorizeSequence( SmallVectorImpl &Incoming, function_ref Comparator, - function_ref AreCompatible, + function_ref, T *)> AreCompatible, function_ref, bool)> TryToVectorizeHelper, bool MaxVFOnly, BoUpSLP &R) { bool Changed = false; @@ -25479,10 +25479,10 @@ static bool tryToVectorizeSequence( auto *SameTypeIt = IncIt; while (SameTypeIt != E && (!isa(*SameTypeIt) || R.isDeleted(cast(*SameTypeIt)) || - AreCompatible(*SameTypeIt, *IncIt))) { + AreCompatible(VL, *SameTypeIt))) { auto *I = dyn_cast(*SameTypeIt); ++SameTypeIt; - if (I && !R.isDeleted(I)) + if (!I || !R.isDeleted(I)) VL.push_back(cast(I)); } @@ -25677,10 +25677,10 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range CmpInsts, return compareCmp(V, V2, *TLI, *DT); }; - auto AreCompatibleCompares = [&](Value *V1, Value *V2) { - if (V1 == V2) + auto AreCompatibleCompares = [&](ArrayRef VL, Value *V1) { + if (VL.empty() || VL.back() == V1) return true; - return compareCmp(V1, V2, *TLI, *DT); + return compareCmp(V1, VL.back(), *TLI, *DT); }; SmallVector Vals; @@ -25886,9 +25886,11 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { } return false; }; - auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) { - if (V1 == V2) + auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](ArrayRef VL, + Value *V1) { + if (VL.empty() || V1 == VL.back()) return true; + Value *V2 = VL.back(); if (V1->getType() != V2->getType()) return false; ArrayRef Opcodes1 = PHIToOpcodes[V1]; @@ -26259,7 +26261,11 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { V2->getValueOperand()->getValueID(); }; - auto &&AreCompatibleStores = [this](StoreInst *V1, StoreInst *V2) { + auto AreCompatibleStores = [this, &R](ArrayRef VL, + StoreInst *V1) { + if (VL.empty()) + return true; + StoreInst *V2 = VL.back(); if (V1 == V2) return true; if (V1->getValueOperand()->getType() != V2->getValueOperand()->getType()) @@ -26270,15 +26276,24 @@ bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) { if (isa(V1->getValueOperand()) || isa(V2->getValueOperand())) return true; - if (auto *I1 = dyn_cast(V1->getValueOperand())) - if (auto *I2 = dyn_cast(V2->getValueOperand())) { - if (I1->getParent() != I2->getParent()) - return false; - return getSameOpcode({I1, I2}, *TLI).valid(); - } if (isa(V1->getValueOperand()) && isa(V2->getValueOperand())) return true; + auto *I1 = dyn_cast(V1->getValueOperand()); + auto *I2 = dyn_cast(V2->getValueOperand()); + if (I1 || I2) { + if (I1 && I2 && I1->getParent() != I2->getParent()) + return false; + SmallVector NewVL(VL.size() + 1); + for (auto [SI, V] : zip(VL, NewVL)) + V = SI->getValueOperand(); + NewVL.back() = V1->getValueOperand(); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = + Analysis.buildInstructionsState(NewVL, R, VectorizeCopyableElements); + if (S) + return true; + } return V1->getValueOperand()->getValueID() == V2->getValueOperand()->getValueID(); }; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll index 89051c7aba42c..ed0bd3f38ead7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/no_alternate_divrem.ll @@ -4,24 +4,23 @@ define void @test_add_sdiv(ptr %arr1, ptr %arr2, i32 %a0, i32 %a1, i32 %a2, i32 %a3) { ; CHECK-LABEL: @test_add_sdiv( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr i32, ptr [[ARR1:%.*]], i32 2 -; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr i32, ptr [[ARR1]], i32 3 ; CHECK-NEXT: [[GEP2_2:%.*]] = getelementptr i32, ptr [[ARR2:%.*]], i32 2 ; CHECK-NEXT: [[GEP2_3:%.*]] = getelementptr i32, ptr [[ARR2]], i32 3 -; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[GEP1_2]], align 4 -; CHECK-NEXT: [[V3:%.*]] = load i32, ptr [[GEP1_3]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[A0:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[A1:%.*]], i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], +; CHECK-NEXT: [[V2:%.*]] = load i32, ptr [[GEP2_2]], align 4 +; CHECK-NEXT: [[V3:%.*]] = load i32, ptr [[GEP2_3]], align 4 ; CHECK-NEXT: [[Y2:%.*]] = add nsw i32 [[A2:%.*]], 42 -; CHECK-NEXT: [[Y3:%.*]] = add nsw i32 [[A3:%.*]], 0 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARR2]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[A0:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[A3:%.*]], i32 3 +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> , [[TMP3]] ; CHECK-NEXT: [[RES2:%.*]] = sdiv i32 [[V2]], [[Y2]] -; CHECK-NEXT: [[RES3:%.*]] = add nsw i32 [[V3]], [[Y3]] -; CHECK-NEXT: [[TMP4:%.*]] = load <2 x i32>, ptr [[ARR1]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP4]], [[TMP2]] -; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[ARR2]], align 4 -; CHECK-NEXT: store i32 [[RES2]], ptr [[GEP2_2]], align 4 -; CHECK-NEXT: store i32 [[RES3]], ptr [[GEP2_3]], align 4 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[RES2]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[V3]], i32 3 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP4]] +; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[ARR3:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll index 4f3d551e21122..75aec45d3e2fc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vect_copyable_in_binops.ll @@ -33,32 +33,12 @@ entry: } define void @add1(ptr noalias %dst, ptr noalias %src) { -; NON-POW2-LABEL: @add1( -; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 -; NON-POW2-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; NON-POW2-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 -; NON-POW2-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4 -; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr [[INCDEC_PTR]], align 4 -; NON-POW2-NEXT: [[TMP2:%.*]] = add nsw <3 x i32> [[TMP1]], -; NON-POW2-NEXT: store <3 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 -; NON-POW2-NEXT: ret void -; -; POW2-ONLY-LABEL: @add1( -; POW2-ONLY-NEXT: entry: -; POW2-ONLY-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 -; POW2-ONLY-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; POW2-ONLY-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 -; POW2-ONLY-NEXT: store i32 [[TMP0]], ptr [[DST]], align 4 -; POW2-ONLY-NEXT: [[INCDEC_PTR5:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3 -; POW2-ONLY-NEXT: [[INCDEC_PTR7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3 -; POW2-ONLY-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR]], align 4 -; POW2-ONLY-NEXT: [[TMP2:%.*]] = add nsw <2 x i32> [[TMP1]], -; POW2-ONLY-NEXT: store <2 x i32> [[TMP2]], ptr [[INCDEC_PTR1]], align 4 -; POW2-ONLY-NEXT: [[TMP3:%.*]] = load i32, ptr [[INCDEC_PTR5]], align 4 -; POW2-ONLY-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP3]], 3 -; POW2-ONLY-NEXT: store i32 [[ADD9]], ptr [[INCDEC_PTR7]], align 4 -; POW2-ONLY-NEXT: ret void +; CHECK-LABEL: @add1( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]], +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 +; CHECK-NEXT: ret void ; entry: %incdec.ptr = getelementptr inbounds i32, ptr %src, i64 1 @@ -84,18 +64,9 @@ entry: define void @sub0(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @sub0( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -182,18 +153,9 @@ entry: define void @addsub0(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @addsub0( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SRC]], align 4 -; CHECK-NEXT: [[SUB:%.*]] = add nsw i32 [[TMP0]], -1 -; CHECK-NEXT: [[INCDEC_PTR1:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 1 -; CHECK-NEXT: store i32 [[SUB]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 2 -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4 -; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 2 -; CHECK-NEXT: store i32 [[TMP1]], ptr [[INCDEC_PTR1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: store <2 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: @@ -220,18 +182,9 @@ entry: define void @addsub1(ptr noalias %dst, ptr noalias %src) { ; CHECK-LABEL: @addsub1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[INCDEC_PTR2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 2 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[SRC]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP0]], -; CHECK-NEXT: store <2 x i32> [[TMP3]], ptr [[DST]], align 4 -; CHECK-NEXT: [[INCDEC_PTR4:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 3 -; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INCDEC_PTR2]], align 4 -; CHECK-NEXT: [[INCDEC_PTR6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 3 -; CHECK-NEXT: store i32 [[TMP4]], ptr [[INCDEC_PTR3]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[INCDEC_PTR4]], align 4 -; CHECK-NEXT: [[SUB8:%.*]] = sub nsw i32 [[TMP5]], -3 -; CHECK-NEXT: store i32 [[SUB8]], ptr [[INCDEC_PTR6]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[SRC:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[TMP0]], +; CHECK-NEXT: store <4 x i32> [[TMP1]], ptr [[DST:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: From f24ae708fcecd04027fae5d3125ffdb1886971b5 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 12 Aug 2025 16:07:15 +0000 Subject: [PATCH 2/2] Address comments Created using spr 1.3.5 --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b605cddc9258b..fcc926f8de238 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -25482,7 +25482,7 @@ static bool tryToVectorizeSequence( AreCompatible(VL, *SameTypeIt))) { auto *I = dyn_cast(*SameTypeIt); ++SameTypeIt; - if (!I || !R.isDeleted(I)) + if (I && !R.isDeleted(I)) VL.push_back(cast(I)); }