From 26ac382feb0030266da42ed2e4dbb6614cecace1 Mon Sep 17 00:00:00 2001
From: Yeting Kuo
Date: Mon, 1 Jul 2024 21:08:26 -0700
Subject: [PATCH 1/3] [RISCV] Expand vp.strided.load to a splat of a scalar load.

This is similar to a214c521f8763b36dd400b89017f74ad5ae4b6c7, but for
vp.strided.load. Some targets prefer the pattern (vmv.v.x (load)) over a
vlse with zero stride.
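
For example (a rough sketch; the vector type and value names below are
illustrative, not taken from the tests), an unmasked zero-strided load with a
non-zero EVL such as

  %v = call <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %p, i8 0, <vscale x 1 x i1> splat (i1 true), i32 4)

is rewritten in RISCVCodeGenPrepare into a scalar load plus a vmv.v.x
(vfmv.v.f for floating-point element types), roughly:

  ; EVL is zero-extended to i64 on RV64; on RV32 the i32 EVL is used directly.
  %s = load i8, ptr %p
  %v = call <vscale x 1 x i8> @llvm.riscv.vmv.v.x.nxv1i8.i64(<vscale x 1 x i8> poison, i8 %s, i64 4)

so that instruction selection emits lbu + vmv.v.x instead of a vlse with
stride zero on subtargets without optimized zero-stride loads.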
---
 llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 46 +++++++++++++++++++
 .../RISCV/rvv/fixed-vectors-strided-vpload.ll | 38 +++++++++++++--
 llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll | 46 ++++++++++++++++++-
 3 files changed, 124 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 6e0f429c34b2f6..60ae0d49bd020b 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -18,9 +18,11 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstVisitor.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
@@ -35,6 +37,7 @@ namespace {
 class RISCVCodeGenPrepare : public FunctionPass,
                             public InstVisitor<RISCVCodeGenPrepare> {
   const DataLayout *DL;
+  const DominatorTree *DT;
   const RISCVSubtarget *ST;
 
 public:
@@ -48,12 +51,14 @@ class RISCVCodeGenPrepare : public FunctionPass,
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<TargetPassConfig>();
   }
 
   bool visitInstruction(Instruction &I) { return false; }
   bool visitAnd(BinaryOperator &BO);
   bool visitIntrinsicInst(IntrinsicInst &I);
+  bool expandVPStrideLoad(IntrinsicInst &I);
 };
 
 } // end anonymous namespace
@@ -128,6 +133,9 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) {
 // Which eliminates the scalar -> vector -> scalar crossing during instruction
 // selection.
 bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+  if (expandVPStrideLoad(I))
+    return true;
+
   if (I.getIntrinsicID() != Intrinsic::vector_reduce_fadd)
     return false;
 
@@ -155,6 +163,43 @@ bool RISCVCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
   return true;
 }
 
+bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
+  if (ST->hasOptimizedZeroStrideLoad())
+    return false;
+
+  Value *BasePtr, *VL;
+  using namespace PatternMatch;
+  if (!match(&II, m_Intrinsic<Intrinsic::experimental_vp_strided_load>(
+                      m_Value(BasePtr), m_Zero(), m_AllOnes(), m_Value(VL))))
+    return false;
+
+  if (!isKnownNonZero(VL, {*DL, DT, nullptr, &II}))
+    return false;
+
+  auto *VTy = cast<VectorType>(II.getType());
+
+  // FIXME: Support fixed vector types.
+  if (!isa<ScalableVectorType>(VTy))
+    return false;
+
+  IRBuilder<> Builder(&II);
+
+  // Extend VL from i32 to XLen if needed.
+  if (ST->is64Bit())
+    VL = Builder.CreateZExt(VL, Builder.getInt64Ty());
+
+  Type *STy = VTy->getElementType();
+  Value *Val = Builder.CreateLoad(STy, BasePtr);
+  unsigned VMVOp = STy->isFloatingPointTy() ? Intrinsic::riscv_vfmv_v_f
+                                            : Intrinsic::riscv_vmv_v_x;
+  Value *Res = Builder.CreateIntrinsic(VMVOp, {VTy, VL->getType()},
+                                       {PoisonValue::get(VTy), Val, VL});
+
+  II.replaceAllUsesWith(Res);
+  II.eraseFromParent();
+  return true;
+}
+
 bool RISCVCodeGenPrepare::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -164,6 +209,7 @@ bool RISCVCodeGenPrepare::runOnFunction(Function &F) {
   ST = &TM.getSubtarget<RISCVSubtarget>(F);
 
   DL = &F.getDataLayout();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
   bool MadeChange = false;
   for (auto &BB : F)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 5e64e9fbc1a2f5..50e8e34cee2a1a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -1,10 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
-; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
+; RUN:   -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   -check-prefixes=CHECK,CHECK-RV32
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
-; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
+; RUN:   -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   -check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   -check-prefixes=CHECK,CHECK-RV32
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   -check-prefixes=CHECK,CHECK-RV64
 
 declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr, i8, <2 x i1>, i32)
 
@@ -626,3 +632,27 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 }
 
 declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64, <33 x i1>, i32)
+
+; Test unmasked integer zero strided
+define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
+; CHECK-LABEL: zero_strided_unmasked_vpload_4i8_i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT:    vlse8.v v8, (a0), zero
+; CHECK-NEXT:    ret
+  %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 4)
+  ret <4 x i8> %load
+}
+
+; Test unmasked float zero strided
+define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
+; CHECK-LABEL: zero_strided_unmasked_vpload_4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vlse16.v v8, (a0), zero
+; CHECK-NEXT:    ret
+  %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 4)
+  ret <4 x half> %load
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-OPT: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 4d3bced0bcb50f..d422ed5dcfc223 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -1,10 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s \
-; RUN:   -check-prefixes=CHECK,CHECK-RV32
+; RUN:   -check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s \
-; RUN:   -check-prefixes=CHECK,CHECK-RV64
+; RUN:   -check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   -check-prefixes=CHECK,CHECK-RV32,CHECK-NOOPT
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
+; RUN:   -verify-machineinstrs < %s | FileCheck %s \
+; RUN:   -check-prefixes=CHECK,CHECK-RV64,CHECK-NOOPT
 
 declare <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, <vscale x 1 x i1>, i32)
 
@@ -780,3 +786,39 @@ define <vscale x 17 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vscale x 17 x i1> %mask,
 declare <vscale x 17 x double> @llvm.experimental.vp.strided.load.nxv17f64.p0.i64(ptr, i64, <vscale x 17 x i1>, i32)
 declare <vscale x 1 x double> @llvm.experimental.vector.extract.nxv1f64(<vscale x 17 x double> %vec, i64 %idx)
 declare <vscale x 16 x double> @llvm.experimental.vector.extract.nxv16f64(<vscale x 17 x double> %vec, i64 %idx)
+
+; Test unmasked integer zero strided
+define <vscale x 1 x i8> @zero_strided_unmasked_vpload_nxv1i8_i8(ptr %ptr) {
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_nxv1i8_i8:
+; CHECK-OPT:       # %bb.0:
+; CHECK-OPT-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; CHECK-OPT-NEXT:    vlse8.v v8, (a0), zero
+; CHECK-OPT-NEXT:    ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_nxv1i8_i8:
+; CHECK-NOOPT:       # %bb.0:
+; CHECK-NOOPT-NEXT:    lbu a0, 0(a0)
+; CHECK-NOOPT-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
+; CHECK-NOOPT-NEXT:    vmv.v.x v8, a0
+; CHECK-NOOPT-NEXT:    ret
+  %load = call <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 0, <vscale x 1 x i1> splat (i1 true), i32 4)
+  ret <vscale x 1 x i8> %load
+}
+
+; Test unmasked float zero strided
+define <vscale x 1 x half> @zero_strided_unmasked_vpload_nxv1f16(ptr %ptr) {
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_nxv1f16:
+; CHECK-OPT:       # %bb.0:
+; CHECK-OPT-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; CHECK-OPT-NEXT:    vlse16.v v8, (a0), zero
+; CHECK-OPT-NEXT:    ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_nxv1f16:
+; CHECK-NOOPT:       # %bb.0:
+; CHECK-NOOPT-NEXT:    flh fa5, 0(a0)
+; CHECK-NOOPT-NEXT:    vsetivli zero, 4, e16, mf4, ta, ma
+; CHECK-NOOPT-NEXT:    vfmv.v.f v8, fa5
+; CHECK-NOOPT-NEXT:    ret
+  %load = call <vscale x 1 x half> @llvm.experimental.vp.strided.load.nxv1f16.p0.i32(ptr %ptr, i32 0, <vscale x 1 x i1> splat (i1 true), i32 4)
+  ret <vscale x 1 x half> %load
+}

From 8fdd86bece66ebe8bc5abb7df921dfb0152a652f Mon Sep 17 00:00:00 2001
From: Yeting Kuo
Date: Tue, 9 Jul 2024 20:31:39 -0700
Subject: [PATCH 2/3] Use CreateVectorSplat for fixed vectors.
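
As a rough sketch (the value names below are illustrative, not taken from the
tests), a fixed-length zero-strided load such as

  %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 4)

is now expanded to a scalar load followed by an ordinary IR splat, roughly
what CreateVectorSplat emits for fixed vectors:

  ; illustrative names; the splat is insertelement + shufflevector
  %s    = load i8, ptr %ptr
  %ins  = insertelement <4 x i8> poison, i8 %s, i64 0
  %load = shufflevector <4 x i8> %ins, <4 x i8> poison, <4 x i32> zeroinitializer

Legal scalable types still use the vmv.v.x/vfmv.v.f intrinsics so the splat
can be limited to the EVL.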
---
 llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 18 +++++----
 .../RISCV/rvv/fixed-vectors-strided-vpload.ll | 40 ++++++++++++-------
 2 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index 60ae0d49bd020b..a2200e2d8642d0 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -178,10 +178,6 @@ bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
 
   auto *VTy = cast<VectorType>(II.getType());
 
-  // FIXME: Support fixed vector types.
-  if (!isa<ScalableVectorType>(VTy))
-    return false;
-
   IRBuilder<> Builder(&II);
 
   // Extend VL from i32 to XLen if needed.
@@ -190,10 +186,16 @@ bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
 
   Type *STy = VTy->getElementType();
   Value *Val = Builder.CreateLoad(STy, BasePtr);
-  unsigned VMVOp = STy->isFloatingPointTy() ? Intrinsic::riscv_vfmv_v_f
-                                            : Intrinsic::riscv_vmv_v_x;
-  Value *Res = Builder.CreateIntrinsic(VMVOp, {VTy, VL->getType()},
-                                       {PoisonValue::get(VTy), Val, VL});
+  const auto &TLI = *ST->getTargetLowering();
+  Value *Res = Builder.CreateVectorSplat(VTy->getElementCount(), Val);
+
+  // TODO: Also support fixed/illegal vector types to splat with evl = vl.
+  if (isa<ScalableVectorType>(VTy) && TLI.isTypeLegal(EVT::getEVT(VTy))) {
+    unsigned VMVOp = STy->isFloatingPointTy() ? Intrinsic::riscv_vfmv_v_f
+                                              : Intrinsic::riscv_vmv_v_x;
+    Res = Builder.CreateIntrinsic(VMVOp, {VTy, VL->getType()},
+                                  {PoisonValue::get(VTy), Val, VL});
+  }
 
   II.replaceAllUsesWith(Res);
   II.eraseFromParent();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 50e8e34cee2a1a..86359043a90d94 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -7,10 +7,10 @@
 ; RUN:   -check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s \
-; RUN:   -check-prefixes=CHECK,CHECK-RV32
+; RUN:   -check-prefixes=CHECK,CHECK-RV32,CHECK-NOOPT
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
 ; RUN:   -verify-machineinstrs < %s | FileCheck %s \
-; RUN:   -check-prefixes=CHECK,CHECK-RV64
+; RUN:   -check-prefixes=CHECK,CHECK-RV64,CHECK-NOOPT
 
 declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr, i8, <2 x i1>, i32)
 
@@ -635,24 +635,36 @@ declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64,
 
 ; Test unmasked integer zero strided
 define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr) {
-; CHECK-LABEL: zero_strided_unmasked_vpload_4i8_i8:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
-; CHECK-NEXT:    vlse8.v v8, (a0), zero
-; CHECK-NEXT:    ret
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
+; CHECK-OPT:       # %bb.0:
+; CHECK-OPT-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-OPT-NEXT:    vlse8.v v8, (a0), zero
+; CHECK-OPT-NEXT:    ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
+; CHECK-NOOPT:       # %bb.0:
+; CHECK-NOOPT-NEXT:    lbu a0, 0(a0)
+; CHECK-NOOPT-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NOOPT-NEXT:    vmv.v.x v8, a0
+; CHECK-NOOPT-NEXT:    ret
   %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 4)
   ret <4 x i8> %load
 }
 
 ; Test unmasked float zero strided
 define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr) {
-; CHECK-LABEL: zero_strided_unmasked_vpload_4f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
-; CHECK-NEXT:    vlse16.v v8, (a0), zero
-; CHECK-NEXT:    ret
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
+; CHECK-OPT:       # %bb.0:
+; CHECK-OPT-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-OPT-NEXT:    vlse16.v v8, (a0), zero
+; CHECK-OPT-NEXT:    ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_4f16:
+; CHECK-NOOPT:       # %bb.0:
+; CHECK-NOOPT-NEXT:    flh fa5, 0(a0)
+; CHECK-NOOPT-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NOOPT-NEXT:    vfmv.v.f v8, fa5
+; CHECK-NOOPT-NEXT:    ret
   %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 4)
   ret <4 x half> %load
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-OPT: {{.*}}

From 6a8613c759b0d65348d40283868639e71faffa03 Mon Sep 17 00:00:00 2001
From: Yeting Kuo
Date: Tue, 9 Jul 2024 20:47:29 -0700
Subject: [PATCH 3/3] fixup! Address Luke's comment.

---
 llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
index a2200e2d8642d0..10e0496f16d4f1 100644
--- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
+++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp
@@ -187,7 +187,7 @@ bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
   Type *STy = VTy->getElementType();
   Value *Val = Builder.CreateLoad(STy, BasePtr);
   const auto &TLI = *ST->getTargetLowering();
-  Value *Res = Builder.CreateVectorSplat(VTy->getElementCount(), Val);
+  Value *Res;
 
   // TODO: Also support fixed/illegal vector types to splat with evl = vl.
   if (isa<ScalableVectorType>(VTy) && TLI.isTypeLegal(EVT::getEVT(VTy))) {
@@ -195,6 +195,8 @@ bool RISCVCodeGenPrepare::expandVPStrideLoad(IntrinsicInst &II) {
                                               : Intrinsic::riscv_vmv_v_x;
     Res = Builder.CreateIntrinsic(VMVOp, {VTy, VL->getType()},
                                   {PoisonValue::get(VTy), Val, VL});
+  } else {
+    Res = Builder.CreateVectorSplat(VTy->getElementCount(), Val);
   }
 
   II.replaceAllUsesWith(Res);