
[RISCV] Lower unmasked zero-stride vp.stride to a splat of one scalar load. #97394

Closed
Conversation

yetingk (Contributor) commented Jul 2, 2024

This is a patch similar to a214c52, but for vp.strided.load. Some targets prefer the (vmv.v.x (load)) pattern over a vlse with zero stride.
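
For context, a minimal IR sketch of the input pattern (hypothetical function name; it mirrors the tests added below). An unmasked vp.strided.load with a constant zero stride reads the same scalar for every active lane, so targets that prefer it can get one scalar load plus a vmv.v.x splat instead of a zero-stride vlse:

declare <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, <vscale x 1 x i1>, i32)

; Unmasked, zero-stride VP strided load: every active lane reads the same byte.
define <vscale x 1 x i8> @zero_stride_splat_sketch(ptr %p, i32 zeroext %evl) {
  %v = call <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %p, i8 0, <vscale x 1 x i1> splat (i1 true), i32 %evl)
  ret <vscale x 1 x i8> %v
}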

llvmbot (Collaborator) commented Jul 2, 2024

@llvm/pr-subscribers-backend-risc-v

Author: Yeting Kuo (yetingk)

Changes

This is a patch similar to a214c52, but for vp.strided.load. Some targets prefer the (vmv.v.x (load)) pattern over a vlse with zero stride.


Full diff: https://github.com/llvm/llvm-project/pull/97394.diff

3 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+45-22)
  • (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll (+46-4)
  • (modified) llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll (+44-2)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index ee45f730dc450..b792a804bcc5b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -11666,31 +11666,54 @@ SDValue RISCVTargetLowering::lowerVPStridedLoad(SDValue Op,
   auto *VPNode = cast<VPStridedLoadSDNode>(Op);
   // Check if the mask is known to be all ones
   SDValue Mask = VPNode->getMask();
+  SDValue VL = VPNode->getVectorLength();
+  SDValue Stride = VPNode->getStride();
   bool IsUnmasked = ISD::isConstantSplatVectorAllOnes(Mask.getNode());
-
-  SDValue IntID = DAG.getTargetConstant(IsUnmasked ? Intrinsic::riscv_vlse
-                                                   : Intrinsic::riscv_vlse_mask,
-                                        DL, XLenVT);
-  SmallVector<SDValue, 8> Ops{VPNode->getChain(), IntID,
-                              DAG.getUNDEF(ContainerVT), VPNode->getBasePtr(),
-                              VPNode->getStride()};
-  if (!IsUnmasked) {
-    if (VT.isFixedLengthVector()) {
-      MVT MaskVT = ContainerVT.changeVectorElementType(MVT::i1);
-      Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+  SDValue Result, Chain;
+
+  // TODO: We restrict this to unmasked loads currently in consideration of
+  // the complexity of handling all falses masks.
+  MVT ScalarVT = ContainerVT.getVectorElementType();
+  if (IsUnmasked && isNullConstant(Stride) && ContainerVT.isInteger()) {
+    SDValue ScalarLoad =
+        DAG.getExtLoad(ISD::EXTLOAD, DL, XLenVT, VPNode->getChain(),
+                       VPNode->getBasePtr(), ScalarVT, VPNode->getMemOperand());
+    Chain = ScalarLoad.getValue(1);
+    Result = lowerScalarSplat(SDValue(), ScalarLoad, VL, ContainerVT, DL, DAG,
+                              Subtarget);
+  } else if (IsUnmasked && isNullConstant(Stride) && isTypeLegal(ScalarVT)) {
+    SDValue ScalarLoad =
+        DAG.getLoad(ScalarVT, DL, VPNode->getChain(), VPNode->getBasePtr(),
+                    VPNode->getMemOperand());
+    Chain = ScalarLoad.getValue(1);
+    Result = lowerScalarSplat(SDValue(), ScalarLoad, VL, ContainerVT, DL, DAG,
+                              Subtarget);
+  } else {
+    SDValue IntID = DAG.getTargetConstant(
+        IsUnmasked ? Intrinsic::riscv_vlse : Intrinsic::riscv_vlse_mask, DL,
+        XLenVT);
+    SmallVector<SDValue, 8> Ops{VPNode->getChain(), IntID,
+                                DAG.getUNDEF(ContainerVT), VPNode->getBasePtr(),
+                                Stride};
+    if (!IsUnmasked) {
+      if (VT.isFixedLengthVector()) {
+        MVT MaskVT = ContainerVT.changeVectorElementType(MVT::i1);
+        Mask = convertToScalableVector(MaskVT, Mask, DAG, Subtarget);
+      }
+      Ops.push_back(Mask);
+    }
+    Ops.push_back(VL);
+    if (!IsUnmasked) {
+      SDValue Policy =
+          DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
+      Ops.push_back(Policy);
     }
-    Ops.push_back(Mask);
-  }
-  Ops.push_back(VPNode->getVectorLength());
-  if (!IsUnmasked) {
-    SDValue Policy = DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT);
-    Ops.push_back(Policy);
-  }
 
-  SDValue Result =
-      DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
-                              VPNode->getMemoryVT(), VPNode->getMemOperand());
-  SDValue Chain = Result.getValue(1);
+    Result =
+        DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops,
+                                VPNode->getMemoryVT(), VPNode->getMemOperand());
+    Chain = Result.getValue(1);
+  }
 
   if (VT.isFixedLengthVector())
     Result = convertFromScalableVector(VT, Result, DAG, Subtarget);
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
index 5e64e9fbc1a2f..980491a35f0ef 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-strided-vpload.ll
@@ -1,10 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
-; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV32
+; RUN:     -verify-machineinstrs < %s | FileCheck %s \
+; RUN:     -check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
-; RUN:   -verify-machineinstrs < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-RV64
+; RUN:     -verify-machineinstrs < %s | FileCheck %s \
+; RUN:     -check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
+; RUN:     -verify-machineinstrs < %s | FileCheck %s \
+; RUN:     -check-prefixes=CHECK,CHECK-RV32,CHECK-NOOPT
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load  \
+; RUN:     -verify-machineinstrs < %s | FileCheck %s \
+; RUN:     -check-prefixes=CHECK,CHECK-RV64,CHECK-NOOPT
 
 declare <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i8(ptr, i8, <2 x i1>, i32)
 
@@ -626,3 +632,39 @@ define <33 x double> @strided_load_v33f64(ptr %ptr, i64 %stride, <33 x i1> %mask
 }
 
 declare <33 x double> @llvm.experimental.vp.strided.load.v33f64.p0.i64(ptr, i64, <33 x i1>, i32)
+
+; Test unmasked integer zero strided
+define <4 x i8> @zero_strided_unmasked_vpload_4i8_i8(ptr %ptr, i32 zeroext %evl) {
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
+; CHECK-OPT:       # %bb.0:
+; CHECK-OPT-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-OPT-NEXT:    vlse8.v v8, (a0), zero
+; CHECK-OPT-NEXT:    ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_4i8_i8:
+; CHECK-NOOPT:       # %bb.0:
+; CHECK-NOOPT-NEXT:    lbu a0, 0(a0)
+; CHECK-NOOPT-NEXT:    vsetvli zero, a1, e8, mf4, ta, ma
+; CHECK-NOOPT-NEXT:    vmv.v.x v8, a0
+; CHECK-NOOPT-NEXT:    ret
+  %load = call <4 x i8> @llvm.experimental.vp.strided.load.4i8.p0.i8(ptr %ptr, i8 0, <4 x i1> splat (i1 true), i32 %evl)
+  ret <4 x i8> %load
+}
+
+; Test unmasked float zero strided
+define <4 x half> @zero_strided_unmasked_vpload_4f16(ptr %ptr, i32 zeroext %evl) {
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_4f16:
+; CHECK-OPT:       # %bb.0:
+; CHECK-OPT-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-OPT-NEXT:    vlse16.v v8, (a0), zero
+; CHECK-OPT-NEXT:    ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_4f16:
+; CHECK-NOOPT:       # %bb.0:
+; CHECK-NOOPT-NEXT:    flh fa5, 0(a0)
+; CHECK-NOOPT-NEXT:    vsetvli zero, a1, e16, mf2, ta, ma
+; CHECK-NOOPT-NEXT:    vfmv.v.f v8, fa5
+; CHECK-NOOPT-NEXT:    ret
+  %load = call <4 x half> @llvm.experimental.vp.strided.load.4f16.p0.i32(ptr %ptr, i32 0, <4 x i1> splat (i1 true), i32 %evl)
+  ret <4 x half> %load
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
index 4d3bced0bcb50..775c272d21f70 100644
--- a/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/strided-vpload.ll
@@ -1,10 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s \
-; RUN:     -check-prefixes=CHECK,CHECK-RV32
+; RUN:     -check-prefixes=CHECK,CHECK-RV32,CHECK-OPT
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh \
 ; RUN:     -verify-machineinstrs < %s | FileCheck %s \
-; RUN:     -check-prefixes=CHECK,CHECK-RV64
+; RUN:     -check-prefixes=CHECK,CHECK-RV64,CHECK-OPT
+; RUN: llc -mtriple=riscv32 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load \
+; RUN:     -verify-machineinstrs < %s | FileCheck %s \
+; RUN:     -check-prefixes=CHECK,CHECK-RV32,CHECK-NOOPT
+; RUN: llc -mtriple=riscv64 -mattr=+m,+d,+zfh,+v,+zvfh,+no-optimized-zero-stride-load  \
+; RUN:     -verify-machineinstrs < %s | FileCheck %s \
+; RUN:     -check-prefixes=CHECK,CHECK-RV64,CHECK-NOOPT
 
 declare <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, <vscale x 1 x i1>, i32)
 
@@ -780,3 +786,39 @@ define <vscale x 16 x double> @strided_load_nxv17f64(ptr %ptr, i64 %stride, <vsc
 declare <vscale x 17 x double> @llvm.experimental.vp.strided.load.nxv17f64.p0.i64(ptr, i64, <vscale x 17 x i1>, i32)
 declare <vscale x 1 x double> @llvm.experimental.vector.extract.nxv1f64(<vscale x 17 x double> %vec, i64 %idx)
 declare <vscale x 16 x double> @llvm.experimental.vector.extract.nxv16f64(<vscale x 17 x double> %vec, i64 %idx)
+
+; Test unmasked integer zero strided
+define <vscale x 1 x i8> @zero_strided_unmasked_vpload_nxv1i8_i8(ptr %ptr, i32 zeroext %evl) {
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_nxv1i8_i8:
+; CHECK-OPT:       # %bb.0:
+; CHECK-OPT-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-OPT-NEXT:    vlse8.v v8, (a0), zero
+; CHECK-OPT-NEXT:    ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_nxv1i8_i8:
+; CHECK-NOOPT:       # %bb.0:
+; CHECK-NOOPT-NEXT:    lbu a0, 0(a0)
+; CHECK-NOOPT-NEXT:    vsetvli zero, a1, e8, mf8, ta, ma
+; CHECK-NOOPT-NEXT:    vmv.v.x v8, a0
+; CHECK-NOOPT-NEXT:    ret
+  %load = call <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %ptr, i8 0, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 1 x i8> %load
+}
+
+; Test unmasked float zero strided
+define <vscale x 1 x half> @zero_strided_unmasked_vpload_nxv1f16(ptr %ptr, i32 zeroext %evl) {
+; CHECK-OPT-LABEL: zero_strided_unmasked_vpload_nxv1f16:
+; CHECK-OPT:       # %bb.0:
+; CHECK-OPT-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-OPT-NEXT:    vlse16.v v8, (a0), zero
+; CHECK-OPT-NEXT:    ret
+;
+; CHECK-NOOPT-LABEL: zero_strided_unmasked_vpload_nxv1f16:
+; CHECK-NOOPT:       # %bb.0:
+; CHECK-NOOPT-NEXT:    flh fa5, 0(a0)
+; CHECK-NOOPT-NEXT:    vsetvli zero, a1, e16, mf4, ta, ma
+; CHECK-NOOPT-NEXT:    vfmv.v.f v8, fa5
+; CHECK-NOOPT-NEXT:    ret
+  %load = call <vscale x 1 x half> @llvm.experimental.vp.strided.load.nxv1f16.p0.i32(ptr %ptr, i32 0, <vscale x 1 x i1> splat (i1 true), i32 %evl)
+  ret <vscale x 1 x half> %load
+}

Comment on lines +11679 to +11682
SDValue ScalarLoad =
DAG.getExtLoad(ISD::EXTLOAD, DL, XLenVT, VPNode->getChain(),
VPNode->getBasePtr(), ScalarVT, VPNode->getMemOperand());
Chain = ScalarLoad.getValue(1);
Contributor:

I just noticed this is the same code as how Intrinsic::riscv_masked_strided_load is lowered, but I'm wondering: for both riscv_masked_strided_load and this VP strided load lowering, shouldn't we also be checking that the AVL/EVL is non-zero?

I see the requirement for the load being unmasked was discussed here, but I can't see the AVL mentioned. Maybe I'm missing something.

Contributor Author (yetingk):

I think you're right; we should also check the AVL. By the way, riscv_masked_strided_load does not have an AVL operand.

yetingk (Contributor Author) commented Jul 2, 2024

Closing this, since it's too hard to know at compile time that the EVL operand is non-zero.

yetingk closed this Jul 2, 2024
lukel97 (Contributor) commented Jul 2, 2024

I guess we could handle the case for constant EVLs. Are they common in practice?
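
Purely as an illustration (a hypothetical sketch, not part of this patch): with a literal, non-zero EVL the zero-EVL concern disappears, since at least one lane is known to be active and the backing scalar load must execute.

declare <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr, i8, <vscale x 1 x i1>, i32)

; EVL is the constant 1, which is trivially non-zero and never exceeds VLMAX,
; so lowering to a scalar load plus a splat would be safe here.
define <vscale x 1 x i8> @zero_strided_constant_evl_sketch(ptr %p) {
  %v = call <vscale x 1 x i8> @llvm.experimental.vp.strided.load.nxv1i8.p0.i8(ptr %p, i8 0, <vscale x 1 x i1> splat (i1 true), i32 1)
  ret <vscale x 1 x i8> %v
}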

wangpc-pp (Contributor):

I guess we could handle the case for constant EVLs. Are they common in practice?

I don't think we can know; we don't have much practical data yet, since almost no VP intrinsics are generated currently.

lukel97 (Contributor) commented Jul 2, 2024

As an aside, as I understand it the riscv_masked_strided_{load,store} intrinsics are mainly used by RISCVGatherScatterLowering. I wonder if it would be worthwhile to teach RISCVGatherScatterLowering to emit llvm.experimental.vp.strided.{load,store} instead, so we could remove the riscv_masked_strided intrinsics.

From a quick look, we could emulate the passthru via llvm.vp.merge. But we also need to set the EVL to VLMAX somehow.
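
A rough IR sketch of that idea (hypothetical names; it assumes a suitable %vlmax value can be supplied as the EVL): lower the gather-like access to llvm.experimental.vp.strided.load and recover the passthru with llvm.vp.merge.

declare <vscale x 2 x i32> @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr, i64, <vscale x 2 x i1>, i32)
declare <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1>, <vscale x 2 x i32>, <vscale x 2 x i32>, i32)

define <vscale x 2 x i32> @strided_gather_with_passthru(<vscale x 2 x i32> %passthru, ptr %p, i64 %stride, <vscale x 2 x i1> %m, i32 zeroext %vlmax) {
  ; Masked strided load; masked-off lanes of %a carry no defined value.
  %a = call <vscale x 2 x i32> @llvm.experimental.vp.strided.load.nxv2i32.p0.i64(ptr %p, i64 %stride, <vscale x 2 x i1> %m, i32 %vlmax)
  ; vp.merge puts the original passthru back into the masked-off lanes.
  %b = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> %m, <vscale x 2 x i32> %a, <vscale x 2 x i32> %passthru, i32 %vlmax)
  ret <vscale x 2 x i32> %b
}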

wangpc-pp (Contributor):

As an aside, as I understand it the riscv_masked_strided_{load,store} intrinsics are mainly used by RISCVGatherScatterLowering. I wonder if it would be worthwhile to teach RISCVGatherScatterLowering to emit llvm.experimental.vp.strided.{load,store} instead, so we could remove the riscv_masked_strided intrinsics.

From a quick look, we could emulate the passthru via llvm.vp.merge. But we also need to set the EVL to VLMAX somehow.

Are you working on this? If not, I can give it a try.

lukel97 (Contributor) commented Jul 3, 2024

Are you working on this? If not, I can give it a try.

No, go ahead! Not sure if this will end up being relevant or not, but I did a really quick experiment earlier today to see if we emit VLMAX for a "full-length" EVL computed via vscale:

define <vscale x 2 x i32> @f(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask) {
  %vscale = call i32 @llvm.vscale()
  %vlmax = mul i32 %vscale, 2
  %z = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask, i32 %vlmax)
  ret <vscale x 2 x i32> %z
}

But it turns out we don't:

f: 
	csrr	a0, vlenb
	srli	a0, a0, 2
	vsetvli	zero, a0, e32, m1, ta, ma
	vadd.vv	v8, v8, v9, v0.t
	ret

wangpc-pp (Contributor):

Are you working on this? If not, I can give it a try.

No, go ahead! Not sure if this will end up being relevant or not, but I did a really quick experiment earlier today to see if we emit VLMAX for a "full-length" EVL computed via vscale:

Could we use -1 to represent VLMAX?

define <vscale x 2 x i32> @f(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask) {
  %vscale = call i32 @llvm.vscale()
  %vlmax = mul i32 %vscale, 2
  %z = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask, i32 %vlmax)
  ret <vscale x 2 x i32> %z
}

But it turns out we don't:

f: 
	csrr	a0, vlenb
	srli	a0, a0, 2
	vsetvli	zero, a0, e32, m1, ta, ma
	vadd.vv	v8, v8, v9, v0.t
	ret

For now, I don't think it's feasible to use vp.strided.load alone, because llvm.masked.gather has a passthru operand that doesn't exist in vp.strided.load.
I think we need to add a passthru operand to vp.strided.load.

lukel97 (Contributor) commented Jul 3, 2024

Could we use -1 to represent VLMAX?

This doesn't get picked up on RV64 but it does on RV32: I think it's because the EVL argument is 32 bits and we check against a 64 bit sentinel value. Probably something we should fix?

define <vscale x 2 x i32> @f(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask) {
  %z = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask, i32 -1)
  ret <vscale x 2 x i32> %z
}
f: 
	li	a0, -1
	srli	a0, a0, 32
	vsetvli	zero, a0, e32, m1, ta, ma
	vadd.vv	v8, v8, v9, v0.t
	ret

I think we need to add a passthru operand to vp.strided.load.

We should be able to emulate the passthru with vp.merge:

define <vscale x 2 x i32> @vpmerge_vpload(<vscale x 2 x i32> %passthru, ptr %p, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vpmerge_vpload:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, mu
; CHECK-NEXT:    vle32.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %a = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr %p, <vscale x 2 x i1> splat (i1 -1), i32 %vl)
  %b = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> %m, <vscale x 2 x i32> %a, <vscale x 2 x i32> %passthru, i32 %vl)
  ret <vscale x 2 x i32> %b
}

wangpc-pp (Contributor):

Could we use -1 to represent VLMAX?

This doesn't get picked up on RV64 but it does on RV32: I think it's because the EVL argument is 32 bits and we check against a 64 bit sentinel value. Probably something we should fix?

Agreed, I hit the same problem in my quick prototype. We need to fix it.

define <vscale x 2 x i32> @f(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask) {
  %z = call <vscale x 2 x i32> @llvm.vp.add.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y, <vscale x 2 x i1> %mask, i32 -1)
  ret <vscale x 2 x i32> %z
}
f: 
	li	a0, -1
	srli	a0, a0, 32
	vsetvli	zero, a0, e32, m1, ta, ma
	vadd.vv	v8, v8, v9, v0.t
	ret

I think we need to add a passthru operand to vp.strided.load.

We should be able to emulate the passthru with vp.merge:

define <vscale x 2 x i32> @vpmerge_vpload(<vscale x 2 x i32> %passthru, ptr %p, <vscale x 2 x i1> %m, i32 zeroext %vl) {
; CHECK-LABEL: vpmerge_vpload:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli zero, a1, e32, m1, tu, mu
; CHECK-NEXT:    vle32.v v8, (a0), v0.t
; CHECK-NEXT:    ret
  %a = call <vscale x 2 x i32> @llvm.vp.load.nxv2i32.p0(ptr %p, <vscale x 2 x i1> splat (i1 -1), i32 %vl)
  %b = call <vscale x 2 x i32> @llvm.vp.merge.nxv2i32(<vscale x 2 x i1> %m, <vscale x 2 x i32> %a, <vscale x 2 x i32> %passthru, i32 %vl)
  ret <vscale x 2 x i32> %b
}

Yeah, that's feasible!
But is there any reason why vp.strided.load doesn't have a passthru operand? Wouldn't it be more straightforward to add one? I don't know the history, but it seems these intrinsics (gather/scatter vs. vp.strided.load/store) are not consistent.

topperc (Collaborator) commented Jul 3, 2024

Could we use -1 to represent VLMAX?

This is undefined behavior for VP intrinsics. The EVL cannot be larger than the runtime vector length.

topperc (Collaborator) commented Jul 3, 2024

But is there any reason why vp.strided.load doesn't have a passthru operand? Wouldn't it be more straightforward to add one? I don't know the history, but it seems these intrinsics (gather/scatter vs. vp.strided.load/store) are not consistent.

None of the VP intrinsics have a passthru operand. The masked.gather/scatter intrinsics are much older and pretty much copied the X86 instruction semantics. I'm not sure the vectorizer uses the passthru operand; IRBuilderBase::CreateMaskedGather treats it as an optional parameter and uses poison if it's not provided.
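
For reference, a sketch of the IR shape this produces (the function name is made up): when no passthru is supplied, the gather carries poison in the passthru slot, so there is usually nothing a lowering needs to preserve.

declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)

define <vscale x 2 x i32> @gather_default_passthru(<vscale x 2 x ptr> %ptrs, <vscale x 2 x i1> %m) {
  ; Operands: pointers, alignment, mask, passthru (defaulted to poison).
  %v = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32.nxv2p0(<vscale x 2 x ptr> %ptrs, i32 4, <vscale x 2 x i1> %m, <vscale x 2 x i32> poison)
  ret <vscale x 2 x i32> %v
}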

lukel97 (Contributor) commented Jul 4, 2024

I gave this a try and it is possible to replace the strided intrinsics, PoC here: main...lukel97:llvm-project:remove-riscv-masked-strided-intrinsics

The two main prerequisites were that we need to make RISCVDAGToDAGISel::performCombineVMergeAndVOps slightly smarter (#78565) and detect VLMAX EVL operands when lowering VP nodes to VL nodes.

wangpc-pp (Contributor):

I gave this a try and it is possible to replace the strided intrinsics, PoC here: main...lukel97:llvm-project:remove-riscv-masked-strided-intrinsics

The two main prerequisites were that we need to make RISCVDAGToDAGISel::performCombineVMergeAndVOps slightly smarter (#78565) and detect VLMAX EVL operands when lowering VP nodes to VL nodes.

Quick move! 👍
I think you can just create a PR for this. :-)

lukel97 added a commit to lukel97/llvm-project that referenced this pull request Jul 5, 2024
VSCALE is by definition greater than zero, but this checks it via getVScaleRange anyway.

The motivation for this is to be able to check if the EVL for a VP strided load is non-zero in llvm#97394.

I added the tests to the RISC-V backend since the existing X86 known-never-zero.ll test crashed when trying to lower vscale for the +sse2 RUN line.
lukel97 added a commit that referenced this pull request Jul 5, 2024
VSCALE is by definition greater than zero, but this checks it via
getVScaleRange anyway.

The motivation for this is to be able to check if the EVL for a VP
strided load is non-zero in #97394.

I added the tests to the RISC-V backend since the existing X86
known-never-zero.ll test crashed when trying to lower vscale for the
+sse2 RUN line.
lukel97 added a commit to lukel97/llvm-project that referenced this pull request Jul 5, 2024
This is another version of llvm#97394, but performs it as a DAGCombine instead of lowering so that we have a better chance of detecting non-zero EVLs before they are legalized.

The riscv_masked_strided_load already does this, but this combine also checks that the vector element type is legal. Currently a riscv_masked_strided_load with a zero stride of nxv1i64 will crash on rv32, but I'm hoping we can remove the masked_strided intrinsics and replace them with their VP counterparts.

RISCVISelDAGToDAG will lower splats of scalar loads back to zero strided loads anyway, so the test changes are to show how combining it to a scalar load can lead to some .vx patterns being matched.
kbluck pushed a commit to kbluck/llvm-project that referenced this pull request Jul 6, 2024
VSCALE is by definition greater than zero, but this checks it via
getVScaleRange anyway.

The motivation for this is to be able to check if the EVL for a VP
strided load is non-zero in llvm#97394.

I added the tests to the RISC-V backend since the existing X86
known-never-zero.ll test crashed when trying to lower vscale for the
+sse2 RUN line.