[RISCV] Expand vp.stride.load to splat of a scalar load. #98140
Conversation
This is a patch similar to a214c52, but for vp.stride.load: some targets prefer the pattern (vmv.v.x (load)) over a vlse with zero stride.
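For illustration, a minimal sketch of the IR rewrite this patch performs, in the style of the tests below; it assumes a target without optimized zero-stride loads, a legal scalable vector type, and an EVL that is known non-zero (the value names are illustrative, not taken from the patch):

; Before: an unmasked, zero-stride vp.strided.load with a constant EVL.
%v = call <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %p, i8 0, <vscale x 1 x i1> splat (i1 true), i32 4)

; After: a single scalar load splatted across the vector. On RV64 the i32 EVL
; is zero-extended to i64 (XLen) and becomes the VL operand of vmv.v.x.
%s = load i8, ptr %p
%v = call <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8.i64(<vscale x 1 x i8> poison, i8 %s, i64 4)

The non-zero-EVL requirement is what makes the unconditional scalar load safe to introduce; the patch checks it with isKnownNonZero before rewriting.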
LGTM
@llvm/pr-subscribers-backend-risc-v

Author: Yeting Kuo (yetingk)

Changes: This is a patch similar to a214c52, but for vp.stride.load: some targets prefer the pattern (vmv.v.x (load)) over a vlse with zero stride. It is the IR version of #97798.

Full diff: https://github.com/llvm/llvm-project/pull/98140.diff

3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 6e0f429c34b2f..10e0496f16d4f 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -18,9 +18,11 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -35,6 +37,7 @@ namespace {
class RISCVCodeGenPrepare : public FunctionPass,
public InstVisitor<RISCVCodeGenPrepare, bool> {
const DataLayout *DL;
+ const DominatorTree *DT;
const RISCVSubtarget *ST;
public:
@@ -48,12 +51,14 @@ class RISCVCodeGenPrepare : public FunctionPass,
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
}
bool visitInstruction(Instruction &I) { return false; }
bool visitAnd(BinaryOperator &BO);
bool visitIntrinsicInst(IntrinsicInst &I);
+ bool expandVPStrideLoad(IntrinsicInst &I);
};
} // end anonymous namespace
@@ -128,6 +133,9 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
// Which eliminates the scalar -> vector -> scalar crossing during instruction
// selection.
bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+ if (expandVPStrideLoad(I))
+ return true;
+
if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd)
return false;
@@ -155,6 +163,47 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
return true;
}
+bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
+ if (ST->hasOptimizedZeroStrideLoad())
+ return false;
+
+ Value *BasePtr, *VL;
+ using namespace PatternMatch;
+ if (!match(&II, m_Intrinsic<Intrinsic::experimental_vp_strided_load>(
+ m_Value(BasePtr), m_Zero(), m_AllOnes(), m_Value(VL))))
+ return false;
+
+ if (!isKnownNonZero(VL, {*DL, DT, nullptr, &II}))
+ return false;
+
+ auto *VTy = cast<VectorType>(II.getType());
+
+ IRBuilder<> Builder(&II);
+
+ // Extend VL from i32 to XLen if needed.
+ if (ST->is64Bit())
+ VL = Builder.CreateZExt(VL, Builder.getInt64Ty());
+
+ Type *STy = VTy->getElementType();
+ Value *Val = Builder.CreateLoad(STy, BasePtr);
+ const auto &TLI = *ST->getTargetLowering();
+ Value *Res;
+
+ // TODO: Also support fixed/illegal vector types to splat with evl = vl.
+ if (isa<ScalableVectorType>(VTy) && TLI.isTypeLegal(EVT::getEVT(VTy))) {
+ unsigned VMVOp = STy->isFloatingPointTy() ? Intrinsic::riscv_vfmv_v_f
+ : Intrinsic::riscv_vmv_v_x;
+ Res = Builder.CreateIntrinsic(VMVOp, {VTy, VL->getType()},
+ {PoisonValue::get(VTy), Val, VL});
+ } else {
+ Res = Builder.CreateVectorSplat(VTy->getElementCount(), Val);
+ }
+
+ II.replaceAllUsesWith(Res);
+ II.eraseFromParent();
+ return true;
+}
+
bool RISCVCodeGenPrepare::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
@@ -164,6 +213,7 @@ bool RISCVCodeGenPrepare::runOnFunction(Function &F) {
ST = &TM.getSubtarget<RISCVSubtarget>(F);
DL = &F.getDataLayout();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
bool MadeChange = false;
for (auto &BB : F)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 5e64e9fbc1a2f..86359043a90d9 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -1,10 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
-; RUN: -verify-machineinstrs < %s \
-; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: -check-prefixes=CHECK,CHECK-RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
-; RUN: -verify-machineinstrs < %s \
-; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: -check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: -check-prefixes=CHECK,CHECK-RV32,CHECK-NOOPT
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: -check-prefixes=CHECK,CHECK-RV64,CHECK-NOOPT
declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr, i8, <2 x i1>, i32)
@@ -626,3 +632,39 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
}
declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64, <33 x i1>, i32)
+
+; Test unmasked integer zero strided
+define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
+; CHECK-OPT: # %bb.0:
+; CHECK-OPT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-OPT-NEXT: vlse8.v v8, (a0), zero
+; CHECK-OPT-NEXT: ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
+; CHECK-NOOPT: # %bb.0:
+; CHECK-NOOPT-NEXT: lbu a0, 0(a0)
+; CHECK-NOOPT-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NOOPT-NEXT: vmv.v.x v8, a0
+; CHECK-NOOPT-NEXT: ret
+ %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 4)
+ ret <4 x i8> %load
+}
+
+; Test unmasked float zero strided
+define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
+; CHECK-OPT: # %bb.0:
+; CHECK-OPT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-OPT-NEXT: vlse16.v v8, (a0), zero
+; CHECK-OPT-NEXT: ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_4f16:
+; CHECK-NOOPT: # %bb.0:
+; CHECK-NOOPT-NEXT: flh fa5, 0(a0)
+; CHECK-NOOPT-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NOOPT-NEXT: vfmv.v.f v8, fa5
+; CHECK-NOOPT-NEXT: ret
+ %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 4)
+ ret <4 x half> %load
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 4d3bced0bcb50..d422ed5dcfc22 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -1,10 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
; RUN: -verify-machineinstrs < %s | FileCheck %s \
-; RUN: -check-prefixes=CHECK,CHECK-RV32
+; RUN: -check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
; RUN: -verify-machineinstrs < %s | FileCheck %s \
-; RUN: -check-prefixes=CHECK,CHECK-RV64
+; RUN: -check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: -check-prefixes=CHECK,CHECK-RV32,CHECK-NOOPT
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
+; RUN: -verify-machineinstrs < %s | FileCheck %s \
+; RUN: -check-prefixes=CHECK,CHECK-RV64,CHECK-NOOPT
declare <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, <vscale x 1 x i1>, i32)
@@ -780,3 +786,39 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
declare <vscale x 17 x double> @llvm.experimental.vp.strided.load.nxv17f64.p0.i64(ptr, i64, <vscale x 17 x i1>, i32)
declare <vscale x 1 x double> @llvm.experimental.vector.extract.nxv1f64(<vscale x 17 x double> %vec, i64 %idx)
declare <vscale x 16 x double> @llvm.experimental.vector.extract.nxv16f64(<vscale x 17 x double> %vec, i64 %idx)
+
+; Test unmasked integer zero strided
+define <vscale x 1 x i8> @zero_strided_unmasked_vpload_nxv1i8_i8(ptr %ptr) {
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_nxv1i8_i8:
+; CHECK-OPT: # %bb.0:
+; CHECK-OPT-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
+; CHECK-OPT-NEXT: vlse8.v v8, (a0), zero
+; CHECK-OPT-NEXT: ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_nxv1i8_i8:
+; CHECK-NOOPT: # %bb.0:
+; CHECK-NOOPT-NEXT: lbu a0, 0(a0)
+; CHECK-NOOPT-NEXT: vsetivli zero, 4, e8, mf8, ta, ma
+; CHECK-NOOPT-NEXT: vmv.v.x v8, a0
+; CHECK-NOOPT-NEXT: ret
+ %load = call <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 0, <vscale x 1 x i1> splat (i1 true), i32 4)
+ ret <vscale x 1 x i8> %load
+}
+
+; Test unmasked float zero strided
+define <vscale x 1 x half> @zero_strided_unmasked_vpload_nxv1f16(ptr %ptr) {
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_nxv1f16:
+; CHECK-OPT: # %bb.0:
+; CHECK-OPT-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; CHECK-OPT-NEXT: vlse16.v v8, (a0), zero
+; CHECK-OPT-NEXT: ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_nxv1f16:
+; CHECK-NOOPT: # %bb.0:
+; CHECK-NOOPT-NEXT: flh fa5, 0(a0)
+; CHECK-NOOPT-NEXT: vsetivli zero, 4, e16, mf4, ta, ma
+; CHECK-NOOPT-NEXT: vfmv.v.f v8, fa5
+; CHECK-NOOPT-NEXT: ret
+ %load = call <vscale x 1 x half> @llvm.experimental.vp.strided.load.nxv1f16.p0.i32(ptr %ptr, i32 0, <vscale x 1 x i1> splat (i1 true), i32 4)
+ ret <vscale x 1 x half> %load
+}
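A side note on the patch's fallback path: for fixed-length or type-illegal vector types, expandVPStrideLoad uses IRBuilder::CreateVectorSplat instead of the riscv.vmv.v.x intrinsic, which emits the generic LLVM splat idiom. A minimal sketch of what that path produces (value names are illustrative):

; Generic splat: insert the loaded scalar into lane 0, then broadcast it
; with a zero shuffle mask.
%s = load i8, ptr %p
%ins = insertelement <4 x i8> poison, i8 %s, i64 0
%splat = shufflevector <4 x i8> %ins, <4 x i8> poison, <4 x i32> zeroinitializer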
I plan to wait around 12 hours for further comments on this PR. If no new comments appear, I will merge it.
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/185/builds/1537 Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/137/builds/1548 Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/56/builds/2089 Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/1547 Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/175/builds/1538 Here is the relevant piece of the build log for reference:
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/60/builds/2190 Here is the relevant piece of the build log for reference:
Whoops, I think this needs to be rebased after #98205. I wish we had a merge queue to catch these kinds of things.
LLVM Buildbot has detected a new failure on builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/422 Here is the relevant piece of the build log for reference:
This is a recommit of llvm#98140. It is based on llvm#98205, which changes the feature for the hardware zero-stride-load optimization. This is a patch similar to a214c52, but for vp.stride.load: some targets prefer the pattern (vmv.v.x (load)) over a vlse with zero stride.
This is a patch similar to a214c52, but for vp.stride.load: some targets prefer the pattern (vmv.v.x (load)) over a vlse with zero stride. It is the IR version of llvm#97798.
…vm#98422) Reverts llvm#98140. Breaks tests; see comments on the PR.
This is a patch similar to a214c52, but for vp.stride.load: some targets prefer the pattern (vmv.v.x (load)) over a vlse with zero stride.
It is the IR version of #97798.