From fea72a0fff1bfc0f0a950facf0e7e5404fedc0c0 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Tue, 29 Oct 2024 10:05:07 +0000 Subject: [PATCH 1/4] [AArch64][SVE2] Lower read-after-write mask to whilerw This patch extends the whilewr matching to also match a read-after-write mask and lower it to a whilerw. --- .../Target/AArch64/AArch64ISelLowering.cpp | 34 ++- llvm/test/CodeGen/AArch64/whilewr.ll | 261 +++++++++++++----- 2 files changed, 223 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index bf2f0674b5b65..a2517761afc0c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14189,7 +14189,16 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, return SDValue(); SDValue Diff = Cmp.getOperand(0); - if (Diff.getOpcode() != ISD::SUB || Diff.getValueType() != MVT::i64) + SDValue NonAbsDiff = Diff; + bool WriteAfterRead = true; + // A read-after-write will have an abs call on the diff + if (Diff.getOpcode() == ISD::ABS) { + NonAbsDiff = Diff.getOperand(0); + WriteAfterRead = false; + } + + if (NonAbsDiff.getOpcode() != ISD::SUB || + NonAbsDiff.getValueType() != MVT::i64) return SDValue(); if (!isNullConstant(LaneMask.getOperand(1)) || @@ -14210,8 +14219,13 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, // it's positive, otherwise the difference plus the element size if it's // negative: pos_diff = diff < 0 ? (diff + 7) : diff SDValue Select = DiffDiv.getOperand(0); + SDValue SelectOp3 = Select.getOperand(3); + // Check for an abs in the case of a read-after-write + if (!WriteAfterRead && SelectOp3.getOpcode() == ISD::ABS) + SelectOp3 = SelectOp3.getOperand(0); + // Make sure the difference is being compared by the select - if (Select.getOpcode() != ISD::SELECT_CC || Select.getOperand(3) != Diff) + if (Select.getOpcode() != ISD::SELECT_CC || SelectOp3 != NonAbsDiff) return SDValue(); // Make sure it's checking if the difference is less than 0 if (!isNullConstant(Select.getOperand(1)) || @@ -14243,22 +14257,26 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, } else if (LaneMask.getOperand(2) != Diff) return SDValue(); - SDValue StorePtr = Diff.getOperand(0); - SDValue ReadPtr = Diff.getOperand(1); + SDValue StorePtr = NonAbsDiff.getOperand(0); + SDValue ReadPtr = NonAbsDiff.getOperand(1); unsigned IntrinsicID = 0; switch (EltSize) { case 1: - IntrinsicID = Intrinsic::aarch64_sve_whilewr_b; + IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_b + : Intrinsic::aarch64_sve_whilerw_b; break; case 2: - IntrinsicID = Intrinsic::aarch64_sve_whilewr_h; + IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_h + : Intrinsic::aarch64_sve_whilerw_h; break; case 4: - IntrinsicID = Intrinsic::aarch64_sve_whilewr_s; + IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_s + : Intrinsic::aarch64_sve_whilerw_s; break; case 8: - IntrinsicID = Intrinsic::aarch64_sve_whilewr_d; + IntrinsicID = WriteAfterRead ? Intrinsic::aarch64_sve_whilewr_d + : Intrinsic::aarch64_sve_whilerw_d; break; default: return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll index 9f1ea85079238..ec59e42feb6a4 100644 --- a/llvm/test/CodeGen/AArch64/whilewr.ll +++ b/llvm/test/CodeGen/AArch64/whilewr.ll @@ -30,6 +30,36 @@ entry: ret %active.lane.mask.alias } +define @whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilerw_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.b, x2, x1 +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilerw_8: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: subs x8, x2, x1 +; CHECK-NOSVE2-NEXT: cneg x8, x8, mi +; CHECK-NOSVE2-NEXT: cmp x8, #0 +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b24 = ptrtoint ptr %b to i64 + %c25 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %c25, %b24 + %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) + %neg.compare = icmp slt i64 %0, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %0) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + define @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_commutative: ; CHECK: // %bb.0: // %entry @@ -89,6 +119,39 @@ entry: ret %active.lane.mask.alias } +define @whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilerw_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.h, x2, x1 +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilerw_16: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: subs x8, x2, x1 +; CHECK-NOSVE2-NEXT: cneg x8, x8, mi +; CHECK-NOSVE2-NEXT: cmn x8, #1 +; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NOSVE2-NEXT: asr x8, x8, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9 +; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8 +; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b24 = ptrtoint ptr %b to i64 + %c25 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %c25, %b24 + %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) + %diff = sdiv i64 %0, 2 + %neg.compare = icmp slt i64 %0, -1 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + define @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_32: ; CHECK: // %bb.0: // %entry @@ -122,6 +185,41 @@ entry: ret %active.lane.mask.alias } +define @whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilerw_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.s, x2, x1 +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilerw_32: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: subs x8, x2, x1 +; CHECK-NOSVE2-NEXT: cneg x8, x8, mi +; CHECK-NOSVE2-NEXT: add x9, x8, #3 +; CHECK-NOSVE2-NEXT: cmp x8, #0 +; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt +; CHECK-NOSVE2-NEXT: cmn x8, #3 +; CHECK-NOSVE2-NEXT: cset w8, lt +; CHECK-NOSVE2-NEXT: asr x9, x9, #2 +; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9 +; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b24 = ptrtoint ptr %b to i64 + %c25 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %c25, %b24 + %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) + %diff = sdiv i64 %0, 4 + %neg.compare = icmp slt i64 %0, -3 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + define @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_64: ; CHECK: // %bb.0: // %entry @@ -155,6 +253,41 @@ entry: ret %active.lane.mask.alias } +define @whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: whilerw_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: whilerw p0.d, x2, x1 +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: whilerw_64: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: subs x8, x2, x1 +; CHECK-NOSVE2-NEXT: cneg x8, x8, mi +; CHECK-NOSVE2-NEXT: add x9, x8, #7 +; CHECK-NOSVE2-NEXT: cmp x8, #0 +; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt +; CHECK-NOSVE2-NEXT: cmn x8, #7 +; CHECK-NOSVE2-NEXT: cset w8, lt +; CHECK-NOSVE2-NEXT: asr x9, x9, #3 +; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9 +; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b24 = ptrtoint ptr %b to i64 + %c25 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %c25, %b24 + %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) + %diff = sdiv i64 %0, 8 + %neg.compare = icmp slt i64 %0, -7 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + define @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: no_whilewr_128: ; CHECK: // %bb.0: // %entry @@ -212,7 +345,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_loop_8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB6_3 +; CHECK-NEXT: b.lt .LBB10_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: whilewr p0.b, x1, x2 ; CHECK-NEXT: mov w9, w3 @@ -220,7 +353,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: whilelo p1.b, xzr, x9 ; CHECK-NEXT: cntp x10, p0, p0.b ; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB6_2: // %vector.body +; CHECK-NEXT: .LBB10_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8] @@ -229,14 +362,14 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8] ; CHECK-NEXT: add x8, x8, x10 ; CHECK-NEXT: whilelo p1.b, x8, x9 -; CHECK-NEXT: b.mi .LBB6_2 -; CHECK-NEXT: .LBB6_3: // %for.cond.cleanup +; CHECK-NEXT: b.mi .LBB10_2 +; CHECK-NEXT: .LBB10_3: // %for.cond.cleanup ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilewr_loop_8: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB6_3 +; CHECK-NOSVE2-NEXT: b.lt .LBB10_3 ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: sub x9, x1, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr @@ -250,7 +383,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 ; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b ; CHECK-NOSVE2-NEXT: and x10, x10, #0xff -; CHECK-NOSVE2-NEXT: .LBB6_2: // %vector.body +; CHECK-NOSVE2-NEXT: .LBB10_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8] @@ -259,8 +392,8 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 ; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB6_2 -; CHECK-NOSVE2-NEXT: .LBB6_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: b.mi .LBB10_2 +; CHECK-NOSVE2-NEXT: .LBB10_3: // %for.cond.cleanup ; CHECK-NOSVE2-NEXT: ret entry: %cmp11 = icmp sgt i32 %n, 0 @@ -306,14 +439,14 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_loop_16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB7_3 +; CHECK-NEXT: b.lt .LBB11_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w8, w3 ; CHECK-NEXT: whilewr p1.h, x1, x2 ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: whilelo p0.h, xzr, x8 ; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NEXT: .LBB7_2: // %vector.body +; CHECK-NEXT: .LBB11_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] @@ -321,14 +454,14 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1] ; CHECK-NEXT: inch x9 ; CHECK-NEXT: whilelo p0.h, x9, x8 -; CHECK-NEXT: b.mi .LBB7_2 -; CHECK-NEXT: .LBB7_3: // %for.cond.cleanup +; CHECK-NEXT: b.mi .LBB11_2 +; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilewr_loop_16: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB7_3 +; CHECK-NOSVE2-NEXT: b.lt .LBB11_3 ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: mov w9, w3 ; CHECK-NOSVE2-NEXT: sub x10, x1, x2 @@ -344,7 +477,7 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: cnth x10 ; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b ; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: .LBB7_2: // %vector.body +; CHECK-NOSVE2-NEXT: .LBB11_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] ; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] @@ -352,8 +485,8 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 ; CHECK-NOSVE2-NEXT: whilelo p0.h, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB7_2 -; CHECK-NOSVE2-NEXT: .LBB7_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: b.mi .LBB11_2 +; CHECK-NOSVE2-NEXT: .LBB11_3: // %for.cond.cleanup ; CHECK-NOSVE2-NEXT: ret entry: %cmp11 = icmp sgt i32 %n, 0 @@ -399,14 +532,14 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_loop_32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB8_3 +; CHECK-NEXT: b.lt .LBB12_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w8, w3 ; CHECK-NEXT: whilewr p1.s, x1, x2 ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: whilelo p0.s, xzr, x8 ; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NEXT: .LBB8_2: // %vector.body +; CHECK-NEXT: .LBB12_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] @@ -414,14 +547,14 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2] ; CHECK-NEXT: incw x9 ; CHECK-NEXT: whilelo p0.s, x9, x8 -; CHECK-NEXT: b.mi .LBB8_2 -; CHECK-NEXT: .LBB8_3: // %for.cond.cleanup +; CHECK-NEXT: b.mi .LBB12_2 +; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilewr_loop_32: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB8_3 +; CHECK-NOSVE2-NEXT: b.lt .LBB12_3 ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: mov w9, w3 ; CHECK-NOSVE2-NEXT: sub x10, x1, x2 @@ -439,7 +572,7 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: cntw x10 ; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b ; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: .LBB8_2: // %vector.body +; CHECK-NOSVE2-NEXT: .LBB12_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] ; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] @@ -447,8 +580,8 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 ; CHECK-NOSVE2-NEXT: whilelo p0.s, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB8_2 -; CHECK-NOSVE2-NEXT: .LBB8_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: b.mi .LBB12_2 +; CHECK-NOSVE2-NEXT: .LBB12_3: // %for.cond.cleanup ; CHECK-NOSVE2-NEXT: ret entry: %cmp9 = icmp sgt i32 %n, 0 @@ -494,14 +627,14 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_loop_64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB9_3 +; CHECK-NEXT: b.lt .LBB13_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w8, w3 ; CHECK-NEXT: whilewr p1.d, x1, x2 ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: whilelo p0.d, xzr, x8 ; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NEXT: .LBB9_2: // %vector.body +; CHECK-NEXT: .LBB13_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] @@ -509,14 +642,14 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3] ; CHECK-NEXT: incd x9 ; CHECK-NEXT: whilelo p0.d, x9, x8 -; CHECK-NEXT: b.mi .LBB9_2 -; CHECK-NEXT: .LBB9_3: // %for.cond.cleanup +; CHECK-NEXT: b.mi .LBB13_2 +; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilewr_loop_64: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB9_3 +; CHECK-NOSVE2-NEXT: b.lt .LBB13_3 ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: mov w9, w3 ; CHECK-NOSVE2-NEXT: sub x10, x1, x2 @@ -534,7 +667,7 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: cntd x10 ; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b ; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b -; CHECK-NOSVE2-NEXT: .LBB9_2: // %vector.body +; CHECK-NOSVE2-NEXT: .LBB13_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] ; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] @@ -542,8 +675,8 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 ; CHECK-NOSVE2-NEXT: whilelo p0.d, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB9_2 -; CHECK-NOSVE2-NEXT: .LBB9_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: b.mi .LBB13_2 +; CHECK-NOSVE2-NEXT: .LBB13_3: // %for.cond.cleanup ; CHECK-NOSVE2-NEXT: ret entry: %cmp9 = icmp sgt i32 %n, 0 @@ -589,7 +722,7 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_loop_multiple_8: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB10_3 +; CHECK-NEXT: b.lt .LBB14_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: whilewr p0.b, x0, x2 ; CHECK-NEXT: mov w9, w3 @@ -599,7 +732,7 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: whilelo p1.b, xzr, x9 ; CHECK-NEXT: cntp x10, p0, p0.b ; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB10_2: // %vector.body +; CHECK-NEXT: .LBB14_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8] @@ -608,14 +741,14 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8] ; CHECK-NEXT: add x8, x8, x10 ; CHECK-NEXT: whilelo p1.b, x8, x9 -; CHECK-NEXT: b.mi .LBB10_2 -; CHECK-NEXT: .LBB10_3: // %for.cond.cleanup +; CHECK-NEXT: b.mi .LBB14_2 +; CHECK-NEXT: .LBB14_3: // %for.cond.cleanup ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB10_3 +; CHECK-NOSVE2-NEXT: b.lt .LBB14_3 ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: sub x9, x0, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr @@ -637,7 +770,7 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x9 ; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.b ; CHECK-NOSVE2-NEXT: and x10, x10, #0xff -; CHECK-NOSVE2-NEXT: .LBB10_2: // %vector.body +; CHECK-NOSVE2-NEXT: .LBB14_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NOSVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8] @@ -646,8 +779,8 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: st1b { z0.b }, p1, [x2, x8] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 ; CHECK-NOSVE2-NEXT: whilelo p1.b, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB10_2 -; CHECK-NOSVE2-NEXT: .LBB10_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: b.mi .LBB14_2 +; CHECK-NOSVE2-NEXT: .LBB14_3: // %for.cond.cleanup ; CHECK-NOSVE2-NEXT: ret entry: %cmp11 = icmp sgt i32 %n, 0 @@ -701,7 +834,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_loop_multiple_16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB11_3 +; CHECK-NEXT: b.lt .LBB15_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: whilewr p0.h, x0, x2 ; CHECK-NEXT: mov w9, w3 @@ -711,7 +844,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: whilelo p1.h, xzr, x9 ; CHECK-NEXT: cntp x10, p0, p0.h ; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB11_2: // %vector.body +; CHECK-NEXT: .LBB15_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] @@ -720,14 +853,14 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1] ; CHECK-NEXT: add x8, x8, x10 ; CHECK-NEXT: whilelo p1.h, x8, x9 -; CHECK-NEXT: b.mi .LBB11_2 -; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup +; CHECK-NEXT: b.mi .LBB15_2 +; CHECK-NEXT: .LBB15_3: // %for.cond.cleanup ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB11_3 +; CHECK-NOSVE2-NEXT: b.lt .LBB15_3 ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: sub x9, x0, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr @@ -753,7 +886,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9 ; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.h ; CHECK-NOSVE2-NEXT: and x10, x10, #0xff -; CHECK-NOSVE2-NEXT: .LBB11_2: // %vector.body +; CHECK-NOSVE2-NEXT: .LBB15_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] @@ -762,8 +895,8 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 ; CHECK-NOSVE2-NEXT: whilelo p1.h, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB11_2 -; CHECK-NOSVE2-NEXT: .LBB11_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: b.mi .LBB15_2 +; CHECK-NOSVE2-NEXT: .LBB15_3: // %for.cond.cleanup ; CHECK-NOSVE2-NEXT: ret entry: %cmp11 = icmp sgt i32 %n, 0 @@ -819,7 +952,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_loop_multiple_32: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB12_3 +; CHECK-NEXT: b.lt .LBB16_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: whilewr p0.s, x0, x2 ; CHECK-NEXT: mov w9, w3 @@ -829,7 +962,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: whilelo p1.s, xzr, x9 ; CHECK-NEXT: cntp x10, p0, p0.s ; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB12_2: // %vector.body +; CHECK-NEXT: .LBB16_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] @@ -838,14 +971,14 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] ; CHECK-NEXT: add x8, x8, x10 ; CHECK-NEXT: whilelo p1.s, x8, x9 -; CHECK-NEXT: b.mi .LBB12_2 -; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup +; CHECK-NEXT: b.mi .LBB16_2 +; CHECK-NEXT: .LBB16_3: // %for.cond.cleanup ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB12_3 +; CHECK-NOSVE2-NEXT: b.lt .LBB16_3 ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: sub x9, x0, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr @@ -875,7 +1008,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9 ; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.s ; CHECK-NOSVE2-NEXT: and x10, x10, #0xff -; CHECK-NOSVE2-NEXT: .LBB12_2: // %vector.body +; CHECK-NOSVE2-NEXT: .LBB16_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] @@ -884,8 +1017,8 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 ; CHECK-NOSVE2-NEXT: whilelo p1.s, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB12_2 -; CHECK-NOSVE2-NEXT: .LBB12_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: b.mi .LBB16_2 +; CHECK-NOSVE2-NEXT: .LBB16_3: // %for.cond.cleanup ; CHECK-NOSVE2-NEXT: ret entry: %cmp9 = icmp sgt i32 %n, 0 @@ -941,7 +1074,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_loop_multiple_64: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB13_3 +; CHECK-NEXT: b.lt .LBB17_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: whilewr p0.d, x0, x2 ; CHECK-NEXT: mov w9, w3 @@ -951,7 +1084,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: whilelo p1.d, xzr, x9 ; CHECK-NEXT: cntp x10, p0, p0.d ; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB13_2: // %vector.body +; CHECK-NEXT: .LBB17_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] @@ -960,14 +1093,14 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] ; CHECK-NEXT: add x8, x8, x10 ; CHECK-NEXT: whilelo p1.d, x8, x9 -; CHECK-NEXT: b.mi .LBB13_2 -; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup +; CHECK-NEXT: b.mi .LBB17_2 +; CHECK-NEXT: .LBB17_3: // %for.cond.cleanup ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 -; CHECK-NOSVE2-NEXT: b.lt .LBB13_3 +; CHECK-NOSVE2-NEXT: b.lt .LBB17_3 ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: sub x9, x0, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr @@ -997,7 +1130,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9 ; CHECK-NOSVE2-NEXT: cntp x10, p0, p0.d ; CHECK-NOSVE2-NEXT: and x10, x10, #0xff -; CHECK-NOSVE2-NEXT: .LBB13_2: // %vector.body +; CHECK-NOSVE2-NEXT: .LBB17_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NOSVE2-NEXT: and p1.b, p1/z, p1.b, p0.b ; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] @@ -1006,8 +1139,8 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 ; CHECK-NOSVE2-NEXT: whilelo p1.d, x8, x9 -; CHECK-NOSVE2-NEXT: b.mi .LBB13_2 -; CHECK-NOSVE2-NEXT: .LBB13_3: // %for.cond.cleanup +; CHECK-NOSVE2-NEXT: b.mi .LBB17_2 +; CHECK-NOSVE2-NEXT: .LBB17_3: // %for.cond.cleanup ; CHECK-NOSVE2-NEXT: ret entry: %cmp9 = icmp sgt i32 %n, 0 From c131430b06e289a57098cd01884c3b72b7ebb1f3 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 5 Nov 2024 15:16:56 +0000 Subject: [PATCH 2/4] Fix comparisons in tests --- .../Target/AArch64/AArch64ISelLowering.cpp | 155 ++++++------ llvm/test/CodeGen/AArch64/whilewr.ll | 226 +++++++++--------- 2 files changed, 199 insertions(+), 182 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a2517761afc0c..edf86fd7f806f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14153,11 +14153,13 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) { return ResultSLI; } -/// Try to lower the construction of a pointer alias mask to a WHILEWR. -/// The mask's enabled lanes represent the elements that will not overlap across -/// one loop iteration. This tries to match: -/// or (splat (setcc_lt (sub ptrA, ptrB), -(element_size - 1))), +/// Try to lower the construction of a pointer alias mask to a WHILEWR or +/// WHILERW. The mask's enabled lanes represent the elements that will not +/// overlap across one loop iteration. This tries to match: +/// or (splat (setcc_lt/lte/eq (sub ptrA, ptrB), 0)), /// (get_active_lane_mask 0, (div (sub ptrA, ptrB), element_size)) +/// A call to abs on the subtraction signifies that it's a read-after-write and +/// hence a WHILERW. SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget &Subtarget) { if (!Subtarget.hasSVE2()) @@ -14170,6 +14172,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, if (LaneMask.getOpcode() != ISD::INTRINSIC_WO_CHAIN || LaneMask.getConstantOperandVal(0) != Intrinsic::get_active_lane_mask || + !isNullConstant(LaneMask.getOperand(1)) || Splat.getOpcode() != ISD::SPLAT_VECTOR) return SDValue(); @@ -14177,16 +14180,17 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, if (Cmp.getOpcode() != ISD::SETCC) return SDValue(); - CondCodeSDNode *Cond = cast(Cmp.getOperand(2)); - - auto ComparatorConst = dyn_cast(Cmp.getOperand(1)); - if (!ComparatorConst || ComparatorConst->getSExtValue() > 0 || - Cond->get() != ISD::CondCode::SETLT) - return SDValue(); - unsigned CompValue = std::abs(ComparatorConst->getSExtValue()); - unsigned EltSize = CompValue + 1; - if (!isPowerOf2_64(EltSize) || EltSize > 8) - return SDValue(); + // The number of elements that alias is calculated by dividing the positive + // difference between the pointers by the element size. An alias mask for i8 + // elements omits the division because it would just divide by 1 + SDValue DiffDiv = LaneMask.getOperand(2); + unsigned EltSize = 1; + if (DiffDiv.getOpcode() == ISD::SRA) { + auto DiffDivConst = dyn_cast(DiffDiv.getOperand(1)); + if (!DiffDivConst) + return SDValue(); + EltSize = 1 << DiffDivConst->getZExtValue(); + } SDValue Diff = Cmp.getOperand(0); SDValue NonAbsDiff = Diff; @@ -14197,66 +14201,81 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, WriteAfterRead = false; } - if (NonAbsDiff.getOpcode() != ISD::SUB || - NonAbsDiff.getValueType() != MVT::i64) + ISD::CondCode Cond = cast(Cmp.getOperand(2))->get(); + auto ComparatorConst = dyn_cast(Cmp.getOperand(1)); + if (!ComparatorConst) return SDValue(); - if (!isNullConstant(LaneMask.getOperand(1)) || - (EltSize != 1 && LaneMask.getOperand(2).getOpcode() != ISD::SRA)) + // The diff should be compared to 0. A write-after-read should be less than or + // equal and a read-after-write should be equal. + int CompValue = ComparatorConst->getSExtValue(); + switch (CompValue) { + case 0: + if (WriteAfterRead && Cond != ISD::CondCode::SETLE) + return SDValue(); + else if (!WriteAfterRead && Cond != ISD::CondCode::SETEQ) + return SDValue(); + break; + case 1: + if (!WriteAfterRead) + return SDValue(); + if (Cond != ISD::CondCode::SETLT) + return SDValue(); + break; + default: return SDValue(); + } - // The number of elements that alias is calculated by dividing the positive - // difference between the pointers by the element size. An alias mask for i8 - // elements omits the division because it would just divide by 1 - if (EltSize > 1) { - SDValue DiffDiv = LaneMask.getOperand(2); - auto DiffDivConst = dyn_cast(DiffDiv.getOperand(1)); - if (!DiffDivConst || DiffDivConst->getZExtValue() != Log2_64(EltSize)) - return SDValue(); - if (EltSize > 2) { - // When masking i32 or i64 elements, the positive value of the - // possibly-negative difference comes from a select of the difference if - // it's positive, otherwise the difference plus the element size if it's - // negative: pos_diff = diff < 0 ? (diff + 7) : diff - SDValue Select = DiffDiv.getOperand(0); - SDValue SelectOp3 = Select.getOperand(3); - // Check for an abs in the case of a read-after-write - if (!WriteAfterRead && SelectOp3.getOpcode() == ISD::ABS) - SelectOp3 = SelectOp3.getOperand(0); - - // Make sure the difference is being compared by the select - if (Select.getOpcode() != ISD::SELECT_CC || SelectOp3 != NonAbsDiff) - return SDValue(); - // Make sure it's checking if the difference is less than 0 - if (!isNullConstant(Select.getOperand(1)) || - cast(Select.getOperand(4))->get() != - ISD::CondCode::SETLT) - return SDValue(); - // An add creates a positive value from the negative difference - SDValue Add = Select.getOperand(2); - if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff) - return SDValue(); - if (auto *AddConst = dyn_cast(Add.getOperand(1)); - !AddConst || AddConst->getZExtValue() != EltSize - 1) - return SDValue(); - } else { - // When masking i16 elements, this positive value comes from adding the - // difference's sign bit to the difference itself. This is equivalent to - // the 32 bit and 64 bit case: pos_diff = diff + sign_bit (diff) - SDValue Add = DiffDiv.getOperand(0); - if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff) - return SDValue(); - // A logical right shift by 63 extracts the sign bit from the difference - SDValue Shift = Add.getOperand(1); - if (Shift.getOpcode() != ISD::SRL || Shift.getOperand(0) != Diff) - return SDValue(); - if (auto *ShiftConst = dyn_cast(Shift.getOperand(1)); - !ShiftConst || ShiftConst->getZExtValue() != 63) - return SDValue(); - } - } else if (LaneMask.getOperand(2) != Diff) + if (NonAbsDiff.getOpcode() != ISD::SUB || + NonAbsDiff.getValueType() != MVT::i64) return SDValue(); + if (EltSize == 1) { + // When the element size is 1, the division is omitted, so the lane mask + // just uses the raw difference between the pointers. + if (LaneMask.getOperand(2) != Diff) + return SDValue(); + } else if (EltSize == 2) { + // When masking i16 elements, this positive value comes from adding the + // difference's sign bit to the difference itself. This is equivalent to + // the 32 bit and 64 bit case: pos_diff = diff + sign_bit (diff) + SDValue Add = DiffDiv.getOperand(0); + if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff) + return SDValue(); + // A logical right shift by 63 extracts the sign bit from the difference + SDValue Shift = Add.getOperand(1); + if (Shift.getOpcode() != ISD::SRL || Shift.getOperand(0) != Diff) + return SDValue(); + if (auto *ShiftConst = dyn_cast(Shift.getOperand(1)); + !ShiftConst || ShiftConst->getZExtValue() != 63) + return SDValue(); + } else if (EltSize > 2) { + // When masking i32 or i64 elements, the positive value of the + // possibly-negative difference comes from a select of the difference if + // it's positive, otherwise the difference plus the element size if it's + // negative: pos_diff = diff < 0 ? (diff + 7) : diff + SDValue Select = DiffDiv.getOperand(0); + SDValue SelectOp3 = Select.getOperand(3); + // Check for an abs in the case of a read-after-write + if (!WriteAfterRead && SelectOp3.getOpcode() == ISD::ABS) + SelectOp3 = SelectOp3.getOperand(0); + + // Make sure the difference is being compared by the select + if (Select.getOpcode() != ISD::SELECT_CC || SelectOp3 != NonAbsDiff) + return SDValue(); + // Make sure it's checking if the difference is less than 0 + if (!isNullConstant(Select.getOperand(1)) || + cast(Select.getOperand(4))->get() != + ISD::CondCode::SETLT) + return SDValue(); + // An add creates a positive value from the negative difference + SDValue Add = Select.getOperand(2); + if (Add.getOpcode() != ISD::ADD || Add.getOperand(0) != Diff) + return SDValue(); + if (auto *AddConst = dyn_cast(Add.getOperand(1)); + !AddConst || AddConst->getZExtValue() != EltSize - 1) + return SDValue(); + } SDValue StorePtr = NonAbsDiff.getOperand(0); SDValue ReadPtr = NonAbsDiff.getOperand(1); diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll index ec59e42feb6a4..0bdb4b726731f 100644 --- a/llvm/test/CodeGen/AArch64/whilewr.ll +++ b/llvm/test/CodeGen/AArch64/whilewr.ll @@ -11,7 +11,7 @@ define @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-LABEL: whilewr_8: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: sub x8, x1, x2 -; CHECK-NOSVE2-NEXT: cmp x8, #0 +; CHECK-NOSVE2-NEXT: cmp x8, #1 ; CHECK-NOSVE2-NEXT: cset w9, lt ; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8 ; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1 @@ -22,7 +22,7 @@ entry: %c14 = ptrtoint ptr %c to i64 %b15 = ptrtoint ptr %b to i64 %sub.diff = sub i64 %b15, %c14 - %neg.compare = icmp slt i64 %sub.diff, 0 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) @@ -33,15 +33,15 @@ entry: define @whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilerw_8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilerw p0.b, x2, x1 +; CHECK-NEXT: whilerw p0.b, x1, x2 ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilerw_8: ; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: subs x8, x2, x1 +; CHECK-NOSVE2-NEXT: subs x8, x1, x2 ; CHECK-NOSVE2-NEXT: cneg x8, x8, mi ; CHECK-NOSVE2-NEXT: cmp x8, #0 -; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: cset w9, eq ; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8 ; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1 ; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x8 @@ -50,9 +50,9 @@ define @whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { entry: %b24 = ptrtoint ptr %b to i64 %c25 = ptrtoint ptr %c to i64 - %sub.diff = sub i64 %c25, %b24 + %sub.diff = sub i64 %b24, %c25 %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) - %neg.compare = icmp slt i64 %0, 0 + %neg.compare = icmp eq i64 %0, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %0) @@ -69,7 +69,7 @@ define @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i ; CHECK-NOSVE2-LABEL: whilewr_commutative: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: sub x8, x1, x2 -; CHECK-NOSVE2-NEXT: cmp x8, #0 +; CHECK-NOSVE2-NEXT: cmp x8, #1 ; CHECK-NOSVE2-NEXT: cset w9, lt ; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x8 ; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1 @@ -80,7 +80,7 @@ entry: %c14 = ptrtoint ptr %c to i64 %b15 = ptrtoint ptr %b to i64 %sub.diff = sub i64 %b15, %c14 - %neg.compare = icmp slt i64 %sub.diff, 0 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) @@ -97,7 +97,7 @@ define @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-LABEL: whilewr_16: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: sub x8, x1, x2 -; CHECK-NOSVE2-NEXT: cmn x8, #1 +; CHECK-NOSVE2-NEXT: cmp x8, #1 ; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63 ; CHECK-NOSVE2-NEXT: cset w9, lt ; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 @@ -111,7 +111,7 @@ entry: %c15 = ptrtoint ptr %c to i64 %sub.diff = sub i64 %b14, %c15 %diff = sdiv i64 %sub.diff, 2 - %neg.compare = icmp slt i64 %sub.diff, -1 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) @@ -122,16 +122,16 @@ entry: define @whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilerw_16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilerw p0.h, x2, x1 +; CHECK-NEXT: whilerw p0.h, x1, x2 ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilerw_16: ; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: subs x8, x2, x1 +; CHECK-NOSVE2-NEXT: subs x8, x1, x2 ; CHECK-NOSVE2-NEXT: cneg x8, x8, mi -; CHECK-NOSVE2-NEXT: cmn x8, #1 +; CHECK-NOSVE2-NEXT: cmp x8, #0 ; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63 -; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: cset w9, eq ; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 ; CHECK-NOSVE2-NEXT: asr x8, x8, #1 ; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9 @@ -141,10 +141,10 @@ define @whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { entry: %b24 = ptrtoint ptr %b to i64 %c25 = ptrtoint ptr %c to i64 - %sub.diff = sub i64 %c25, %b24 + %sub.diff = sub i64 %b24, %c25 %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) %diff = sdiv i64 %0, 2 - %neg.compare = icmp slt i64 %0, -1 + %neg.compare = icmp eq i64 %0, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) @@ -164,7 +164,7 @@ define @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: add x9, x8, #3 ; CHECK-NOSVE2-NEXT: cmp x8, #0 ; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt -; CHECK-NOSVE2-NEXT: cmn x8, #3 +; CHECK-NOSVE2-NEXT: cmp x8, #1 ; CHECK-NOSVE2-NEXT: cset w8, lt ; CHECK-NOSVE2-NEXT: asr x9, x9, #2 ; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 @@ -177,7 +177,7 @@ entry: %c13 = ptrtoint ptr %c to i64 %sub.diff = sub i64 %b12, %c13 %diff = sdiv i64 %sub.diff, 4 - %neg.compare = icmp slt i64 %sub.diff, -3 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) @@ -188,31 +188,30 @@ entry: define @whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilerw_32: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilerw p0.s, x2, x1 +; CHECK-NEXT: whilerw p0.s, x1, x2 ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilerw_32: ; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: subs x8, x2, x1 +; CHECK-NOSVE2-NEXT: subs x8, x1, x2 ; CHECK-NOSVE2-NEXT: cneg x8, x8, mi -; CHECK-NOSVE2-NEXT: add x9, x8, #3 ; CHECK-NOSVE2-NEXT: cmp x8, #0 -; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt -; CHECK-NOSVE2-NEXT: cmn x8, #3 -; CHECK-NOSVE2-NEXT: cset w8, lt -; CHECK-NOSVE2-NEXT: asr x9, x9, #2 -; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x9 -; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NOSVE2-NEXT: add x9, x8, #3 +; CHECK-NOSVE2-NEXT: cset w10, eq +; CHECK-NOSVE2-NEXT: csel x8, x9, x8, lt +; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1 +; CHECK-NOSVE2-NEXT: asr x8, x8, #2 +; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 +; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x8 ; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b ; CHECK-NOSVE2-NEXT: ret entry: %b24 = ptrtoint ptr %b to i64 %c25 = ptrtoint ptr %c to i64 - %sub.diff = sub i64 %c25, %b24 + %sub.diff = sub i64 %b24, %c25 %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) %diff = sdiv i64 %0, 4 - %neg.compare = icmp slt i64 %0, -3 + %neg.compare = icmp eq i64 %0, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) @@ -232,7 +231,7 @@ define @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: add x9, x8, #7 ; CHECK-NOSVE2-NEXT: cmp x8, #0 ; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt -; CHECK-NOSVE2-NEXT: cmn x8, #7 +; CHECK-NOSVE2-NEXT: cmp x8, #1 ; CHECK-NOSVE2-NEXT: cset w8, lt ; CHECK-NOSVE2-NEXT: asr x9, x9, #3 ; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 @@ -245,7 +244,7 @@ entry: %c13 = ptrtoint ptr %c to i64 %sub.diff = sub i64 %b12, %c13 %diff = sdiv i64 %sub.diff, 8 - %neg.compare = icmp slt i64 %sub.diff, -7 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) @@ -256,31 +255,30 @@ entry: define @whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilerw_64: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilerw p0.d, x2, x1 +; CHECK-NEXT: whilerw p0.d, x1, x2 ; CHECK-NEXT: ret ; ; CHECK-NOSVE2-LABEL: whilerw_64: ; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: subs x8, x2, x1 +; CHECK-NOSVE2-NEXT: subs x8, x1, x2 ; CHECK-NOSVE2-NEXT: cneg x8, x8, mi -; CHECK-NOSVE2-NEXT: add x9, x8, #7 ; CHECK-NOSVE2-NEXT: cmp x8, #0 -; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt -; CHECK-NOSVE2-NEXT: cmn x8, #7 -; CHECK-NOSVE2-NEXT: cset w8, lt -; CHECK-NOSVE2-NEXT: asr x9, x9, #3 -; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x9 -; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NOSVE2-NEXT: add x9, x8, #7 +; CHECK-NOSVE2-NEXT: cset w10, eq +; CHECK-NOSVE2-NEXT: csel x8, x9, x8, lt +; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1 +; CHECK-NOSVE2-NEXT: asr x8, x8, #3 +; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 ; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b ; CHECK-NOSVE2-NEXT: ret entry: %b24 = ptrtoint ptr %b to i64 %c25 = ptrtoint ptr %c to i64 - %sub.diff = sub i64 %c25, %b24 + %sub.diff = sub i64 %b24, %c25 %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) %diff = sdiv i64 %0, 8 - %neg.compare = icmp slt i64 %0, -7 + %neg.compare = icmp eq i64 %0, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) @@ -297,7 +295,7 @@ define @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) ; CHECK-NEXT: add x9, x8, #15 ; CHECK-NEXT: cmp x8, #0 ; CHECK-NEXT: csel x9, x9, x8, lt -; CHECK-NEXT: cmn x8, #15 +; CHECK-NEXT: cmp x8, #1 ; CHECK-NEXT: asr x9, x9, #4 ; CHECK-NEXT: cset w8, lt ; CHECK-NEXT: sbfx x8, x8, #0, #1 @@ -317,7 +315,7 @@ define @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) ; CHECK-NOSVE2-NEXT: add x9, x8, #15 ; CHECK-NOSVE2-NEXT: cmp x8, #0 ; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt -; CHECK-NOSVE2-NEXT: cmn x8, #15 +; CHECK-NOSVE2-NEXT: cmp x8, #1 ; CHECK-NOSVE2-NEXT: asr x9, x9, #4 ; CHECK-NOSVE2-NEXT: cset w8, lt ; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 @@ -333,7 +331,7 @@ entry: %c13 = ptrtoint ptr %c to i64 %sub.diff = sub i64 %b12, %c13 %diff = sdiv i64 %sub.diff, 16 - %neg.compare = icmp slt i64 %sub.diff, -15 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff) @@ -373,7 +371,7 @@ define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: sub x9, x1, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: cmp x9, #1 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9 ; CHECK-NOSVE2-NEXT: sbfx x9, x10, #0, #1 @@ -404,7 +402,7 @@ for.body.preheader: %b15 = ptrtoint ptr %b to i64 %wide.trip.count = zext nneg i32 %n to i64 %sub.diff = sub i64 %b15, %c14 - %neg.compare = icmp slt i64 %sub.diff, 0 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) @@ -442,18 +440,18 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: b.lt .LBB11_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: whilewr p1.h, x1, x2 +; CHECK-NEXT: whilewr p0.h, x1, x2 ; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: whilelo p0.h, xzr, x8 -; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: whilelo p1.h, xzr, x8 ; CHECK-NEXT: .LBB11_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0, x9, lsl #1] -; CHECK-NEXT: ld1h { z1.h }, p0/z, [x1, x9, lsl #1] +; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b +; CHECK-NEXT: ld1h { z0.h }, p2/z, [x0, x9, lsl #1] +; CHECK-NEXT: ld1h { z1.h }, p2/z, [x1, x9, lsl #1] ; CHECK-NEXT: add z0.h, z1.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p0, [x2, x9, lsl #1] +; CHECK-NEXT: st1h { z0.h }, p1, [x2, x9, lsl #1] ; CHECK-NEXT: inch x9 -; CHECK-NEXT: whilelo p0.h, x9, x8 +; CHECK-NEXT: whilelo p1.h, x9, x8 ; CHECK-NEXT: b.mi .LBB11_2 ; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup ; CHECK-NEXT: ret @@ -467,7 +465,7 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: sub x10, x1, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr ; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x9 -; CHECK-NOSVE2-NEXT: cmn x10, #1 +; CHECK-NOSVE2-NEXT: cmp x10, #1 ; CHECK-NOSVE2-NEXT: add x10, x10, x10, lsr #63 ; CHECK-NOSVE2-NEXT: cset w11, lt ; CHECK-NOSVE2-NEXT: sbfx x11, x11, #0, #1 @@ -476,11 +474,11 @@ define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: whilelo p2.h, xzr, x10 ; CHECK-NOSVE2-NEXT: cnth x10 ; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b -; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b ; CHECK-NOSVE2-NEXT: .LBB11_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1] -; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p0/z, [x1, x8, lsl #1] +; CHECK-NOSVE2-NEXT: and p2.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1h { z0.h }, p2/z, [x0, x8, lsl #1] +; CHECK-NOSVE2-NEXT: ld1h { z1.h }, p2/z, [x1, x8, lsl #1] ; CHECK-NOSVE2-NEXT: add z0.h, z1.h, z0.h ; CHECK-NOSVE2-NEXT: st1h { z0.h }, p0, [x2, x8, lsl #1] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 @@ -501,21 +499,21 @@ for.body.preheader: %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count) %sub.diff = sub i64 %b14, %c15 %diff = sdiv i64 %sub.diff, 2 - %neg.compare = icmp slt i64 %sub.diff, -1 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %2 = and %active.lane.mask.alias, %active.lane.mask.entry br label %vector.body vector.body: %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %2 = and %active.lane.mask.alias, %active.lane.mask %3 = getelementptr inbounds i16, ptr %a, i64 %index - %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, %active.lane.mask, poison) + %wide.masked.load = tail call @llvm.masked.load.nxv8i16.p0(ptr %3, i32 2, %2, poison) %4 = getelementptr inbounds i16, ptr %b, i64 %index - %wide.masked.load16 = tail call @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, %active.lane.mask, poison) + %wide.masked.load16 = tail call @llvm.masked.load.nxv8i16.p0(ptr %4, i32 2, %2, poison) %5 = add %wide.masked.load16, %wide.masked.load %6 = getelementptr inbounds i16, ptr %c, i64 %index tail call void @llvm.masked.store.nxv8i16.p0( %5, ptr %6, i32 2, %active.lane.mask) @@ -535,18 +533,18 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: b.lt .LBB12_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: whilewr p1.s, x1, x2 +; CHECK-NEXT: whilewr p0.s, x1, x2 ; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: whilelo p0.s, xzr, x8 -; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: whilelo p1.s, xzr, x8 ; CHECK-NEXT: .LBB12_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0, x9, lsl #2] -; CHECK-NEXT: ld1w { z1.s }, p0/z, [x1, x9, lsl #2] +; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b +; CHECK-NEXT: ld1w { z0.s }, p2/z, [x0, x9, lsl #2] +; CHECK-NEXT: ld1w { z1.s }, p2/z, [x1, x9, lsl #2] ; CHECK-NEXT: add z0.s, z1.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p0, [x2, x9, lsl #2] +; CHECK-NEXT: st1w { z0.s }, p1, [x2, x9, lsl #2] ; CHECK-NEXT: incw x9 -; CHECK-NEXT: whilelo p0.s, x9, x8 +; CHECK-NEXT: whilelo p1.s, x9, x8 ; CHECK-NEXT: b.mi .LBB12_2 ; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup ; CHECK-NEXT: ret @@ -563,7 +561,7 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: add x11, x10, #3 ; CHECK-NOSVE2-NEXT: cmp x10, #0 ; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt -; CHECK-NOSVE2-NEXT: cmn x10, #3 +; CHECK-NOSVE2-NEXT: cmp x10, #1 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: asr x11, x11, #2 ; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 @@ -571,11 +569,11 @@ define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x10 ; CHECK-NOSVE2-NEXT: cntw x10 ; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b -; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b ; CHECK-NOSVE2-NEXT: .LBB12_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2] -; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p0/z, [x1, x8, lsl #2] +; CHECK-NOSVE2-NEXT: and p2.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1w { z0.s }, p2/z, [x0, x8, lsl #2] +; CHECK-NOSVE2-NEXT: ld1w { z1.s }, p2/z, [x1, x8, lsl #2] ; CHECK-NOSVE2-NEXT: add z0.s, z1.s, z0.s ; CHECK-NOSVE2-NEXT: st1w { z0.s }, p0, [x2, x8, lsl #2] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 @@ -596,21 +594,21 @@ for.body.preheader: %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %wide.trip.count) %sub.diff = sub i64 %b12, %c13 %diff = sdiv i64 %sub.diff, 4 - %neg.compare = icmp slt i64 %sub.diff, -3 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %2 = and %active.lane.mask.alias, %active.lane.mask.entry br label %vector.body vector.body: %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %2 = and %active.lane.mask.alias, %active.lane.mask %3 = getelementptr inbounds i32, ptr %a, i64 %index - %wide.masked.load = tail call @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, %active.lane.mask, poison) + %wide.masked.load = tail call @llvm.masked.load.nxv4i32.p0(ptr %3, i32 4, %2, poison) %4 = getelementptr inbounds i32, ptr %b, i64 %index - %wide.masked.load14 = tail call @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, %active.lane.mask, poison) + %wide.masked.load14 = tail call @llvm.masked.load.nxv4i32.p0(ptr %4, i32 4, %2, poison) %5 = add %wide.masked.load14, %wide.masked.load %6 = getelementptr inbounds i32, ptr %c, i64 %index tail call void @llvm.masked.store.nxv4i32.p0( %5, ptr %6, i32 4, %active.lane.mask) @@ -630,18 +628,18 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: b.lt .LBB13_3 ; CHECK-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: whilewr p1.d, x1, x2 +; CHECK-NEXT: whilewr p0.d, x1, x2 ; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: whilelo p0.d, xzr, x8 -; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b +; CHECK-NEXT: whilelo p1.d, xzr, x8 ; CHECK-NEXT: .LBB13_2: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0, x9, lsl #3] -; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1, x9, lsl #3] +; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b +; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0, x9, lsl #3] +; CHECK-NEXT: ld1d { z1.d }, p2/z, [x1, x9, lsl #3] ; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: st1d { z0.d }, p0, [x2, x9, lsl #3] +; CHECK-NEXT: st1d { z0.d }, p1, [x2, x9, lsl #3] ; CHECK-NEXT: incd x9 -; CHECK-NEXT: whilelo p0.d, x9, x8 +; CHECK-NEXT: whilelo p1.d, x9, x8 ; CHECK-NEXT: b.mi .LBB13_2 ; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup ; CHECK-NEXT: ret @@ -658,7 +656,7 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: add x11, x10, #7 ; CHECK-NOSVE2-NEXT: cmp x10, #0 ; CHECK-NOSVE2-NEXT: csel x11, x11, x10, lt -; CHECK-NOSVE2-NEXT: cmn x10, #7 +; CHECK-NOSVE2-NEXT: cmp x10, #1 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: asr x11, x11, #3 ; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 @@ -666,11 +664,11 @@ define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x10 ; CHECK-NOSVE2-NEXT: cntd x10 ; CHECK-NOSVE2-NEXT: mov p1.b, p2/m, p2.b -; CHECK-NOSVE2-NEXT: and p0.b, p1/z, p1.b, p0.b ; CHECK-NOSVE2-NEXT: .LBB13_2: // %vector.body ; CHECK-NOSVE2-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3] -; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p0/z, [x1, x8, lsl #3] +; CHECK-NOSVE2-NEXT: and p2.b, p1/z, p1.b, p0.b +; CHECK-NOSVE2-NEXT: ld1d { z0.d }, p2/z, [x0, x8, lsl #3] +; CHECK-NOSVE2-NEXT: ld1d { z1.d }, p2/z, [x1, x8, lsl #3] ; CHECK-NOSVE2-NEXT: add z0.d, z1.d, z0.d ; CHECK-NOSVE2-NEXT: st1d { z0.d }, p0, [x2, x8, lsl #3] ; CHECK-NOSVE2-NEXT: add x8, x8, x10 @@ -691,21 +689,21 @@ for.body.preheader: %active.lane.mask.entry = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %wide.trip.count) %sub.diff = sub i64 %b12, %c13 %diff = sdiv i64 %sub.diff, 8 - %neg.compare = icmp slt i64 %sub.diff, -7 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat - %2 = and %active.lane.mask.alias, %active.lane.mask.entry br label %vector.body vector.body: %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ] - %active.lane.mask = phi [ %2, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %active.lane.mask = phi [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ] + %2 = and %active.lane.mask.alias, %active.lane.mask %3 = getelementptr inbounds i64, ptr %a, i64 %index - %wide.masked.load = tail call @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, %active.lane.mask, poison) + %wide.masked.load = tail call @llvm.masked.load.nxv2i64.p0(ptr %3, i32 8, %2, poison) %4 = getelementptr inbounds i64, ptr %b, i64 %index - %wide.masked.load14 = tail call @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, %active.lane.mask, poison) + %wide.masked.load14 = tail call @llvm.masked.load.nxv2i64.p0(ptr %4, i32 8, %2, poison) %5 = add %wide.masked.load14, %wide.masked.load %6 = getelementptr inbounds i64, ptr %c, i64 %index tail call void @llvm.masked.store.nxv2i64.p0( %5, ptr %6, i32 8, %active.lane.mask) @@ -752,13 +750,13 @@ define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: sub x9, x0, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: cmp x9, #1 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: whilelo p0.b, xzr, x9 ; CHECK-NOSVE2-NEXT: sub x9, x1, x2 ; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 ; CHECK-NOSVE2-NEXT: whilelo p1.b, xzr, x10 -; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: cmp x9, #1 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: whilelo p3.b, xzr, x9 ; CHECK-NOSVE2-NEXT: mov w9, w3 @@ -792,13 +790,13 @@ for.body.preheader: %b16 = ptrtoint ptr %b to i64 %wide.trip.count = zext nneg i32 %n to i64 %sub.diff = sub i64 %a15, %c14 - %neg.compare = icmp slt i64 %sub.diff, 0 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff) %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat %sub.diff18 = sub i64 %b16, %c14 - %neg.compare20 = icmp slt i64 %sub.diff18, 0 + %neg.compare20 = icmp sle i64 %sub.diff18, 0 %.splatinsert21 = insertelement poison, i1 %neg.compare20, i64 0 %.splat22 = shufflevector %.splatinsert21, poison, zeroinitializer %ptr.diff.lane.mask23 = tail call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 %sub.diff18) @@ -864,7 +862,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: // %bb.1: // %for.body.preheader ; CHECK-NOSVE2-NEXT: sub x9, x0, x2 ; CHECK-NOSVE2-NEXT: mov x8, xzr -; CHECK-NOSVE2-NEXT: cmn x9, #1 +; CHECK-NOSVE2-NEXT: cmp x9, #1 ; CHECK-NOSVE2-NEXT: add x9, x9, x9, lsr #63 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 @@ -873,7 +871,7 @@ define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: sub x10, x1, x2 ; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x9 ; CHECK-NOSVE2-NEXT: add x9, x10, x10, lsr #63 -; CHECK-NOSVE2-NEXT: cmn x10, #1 +; CHECK-NOSVE2-NEXT: cmp x10, #1 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: asr x9, x9, #1 ; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b @@ -909,14 +907,14 @@ for.body.preheader: %wide.trip.count = zext nneg i32 %n to i64 %sub.diff = sub i64 %a15, %c14 %diff = sdiv i64 %sub.diff, 2 - %neg.compare = icmp slt i64 %sub.diff, -1 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff) %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat %sub.diff18 = sub i64 %b16, %c14 %diff19 = sdiv i64 %sub.diff18, 2 - %neg.compare20 = icmp slt i64 %sub.diff18, -1 + %neg.compare20 = icmp sle i64 %sub.diff18, 0 %.splatinsert21 = insertelement poison, i1 %neg.compare20, i64 0 %.splat22 = shufflevector %.splatinsert21, poison, zeroinitializer %ptr.diff.lane.mask23 = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %diff19) @@ -985,7 +983,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: add x10, x9, #3 ; CHECK-NOSVE2-NEXT: cmp x9, #0 ; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt -; CHECK-NOSVE2-NEXT: cmn x9, #3 +; CHECK-NOSVE2-NEXT: cmp x9, #1 ; CHECK-NOSVE2-NEXT: asr x9, x10, #2 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 @@ -995,7 +993,7 @@ define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: add x10, x9, #3 ; CHECK-NOSVE2-NEXT: cmp x9, #0 ; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt -; CHECK-NOSVE2-NEXT: cmn x9, #3 +; CHECK-NOSVE2-NEXT: cmp x9, #1 ; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b ; CHECK-NOSVE2-NEXT: cset w9, lt ; CHECK-NOSVE2-NEXT: asr x10, x10, #2 @@ -1031,14 +1029,14 @@ for.body.preheader: %wide.trip.count = zext nneg i32 %n to i64 %sub.diff = sub i64 %a13, %c12 %diff = sdiv i64 %sub.diff, 4 - %neg.compare = icmp slt i64 %sub.diff, -3 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat %sub.diff16 = sub i64 %b14, %c12 %diff17 = sdiv i64 %sub.diff16, 4 - %neg.compare18 = icmp slt i64 %sub.diff16, -3 + %neg.compare18 = icmp sle i64 %sub.diff16, 0 %.splatinsert19 = insertelement poison, i1 %neg.compare18, i64 0 %.splat20 = shufflevector %.splatinsert19, poison, zeroinitializer %ptr.diff.lane.mask21 = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff17) @@ -1107,7 +1105,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: add x10, x9, #7 ; CHECK-NOSVE2-NEXT: cmp x9, #0 ; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt -; CHECK-NOSVE2-NEXT: cmn x9, #7 +; CHECK-NOSVE2-NEXT: cmp x9, #1 ; CHECK-NOSVE2-NEXT: asr x9, x10, #3 ; CHECK-NOSVE2-NEXT: cset w10, lt ; CHECK-NOSVE2-NEXT: sbfx x10, x10, #0, #1 @@ -1117,7 +1115,7 @@ define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NOSVE2-NEXT: add x10, x9, #7 ; CHECK-NOSVE2-NEXT: cmp x9, #0 ; CHECK-NOSVE2-NEXT: csel x10, x10, x9, lt -; CHECK-NOSVE2-NEXT: cmn x9, #7 +; CHECK-NOSVE2-NEXT: cmp x9, #1 ; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b ; CHECK-NOSVE2-NEXT: cset w9, lt ; CHECK-NOSVE2-NEXT: asr x10, x10, #3 @@ -1153,14 +1151,14 @@ for.body.preheader: %wide.trip.count = zext nneg i32 %n to i64 %sub.diff = sub i64 %a13, %c12 %diff = sdiv i64 %sub.diff, 8 - %neg.compare = icmp slt i64 %sub.diff, -7 + %neg.compare = icmp sle i64 %sub.diff, 0 %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 %.splat = shufflevector %.splatinsert, poison, zeroinitializer %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat %sub.diff16 = sub i64 %b14, %c12 %diff17 = sdiv i64 %sub.diff16, 8 - %neg.compare18 = icmp slt i64 %sub.diff16, -7 + %neg.compare18 = icmp sle i64 %sub.diff16, 0 %.splatinsert19 = insertelement poison, i1 %neg.compare18, i64 0 %.splat20 = shufflevector %.splatinsert19, poison, zeroinitializer %ptr.diff.lane.mask21 = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff17) From 7844e574d2338f3c55460e9d171b48c934556cdc Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 5 Nov 2024 15:49:55 +0000 Subject: [PATCH 3/4] Check mask size --- .../Target/AArch64/AArch64ISelLowering.cpp | 22 ++ llvm/test/CodeGen/AArch64/whilewr.ll | 352 +++++++++++++++++- 2 files changed, 373 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index edf86fd7f806f..707c29cea59f8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -14164,6 +14164,7 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, const AArch64Subtarget &Subtarget) { if (!Subtarget.hasSVE2()) return SDValue(); + unsigned MaskNumElements = Op.getValueType().getVectorMinNumElements(); SDValue LaneMask = Op.getOperand(0); SDValue Splat = Op.getOperand(1); @@ -14192,6 +14193,27 @@ SDValue tryWhileWRFromOR(SDValue Op, SelectionDAG &DAG, EltSize = 1 << DiffDivConst->getZExtValue(); } + switch (EltSize) { + case 1: + if (MaskNumElements != 16) + return SDValue(); + break; + case 2: + if (MaskNumElements != 8) + return SDValue(); + break; + case 4: + if (MaskNumElements != 4) + return SDValue(); + break; + case 8: + if (MaskNumElements != 2) + return SDValue(); + break; + default: + return SDValue(); + } + SDValue Diff = Cmp.getOperand(0); SDValue NonAbsDiff = Diff; bool WriteAfterRead = true; diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll index 0bdb4b726731f..a67d5920092f6 100644 --- a/llvm/test/CodeGen/AArch64/whilewr.ll +++ b/llvm/test/CodeGen/AArch64/whilewr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-NOSVE2 +; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK,CHECK-NOSVE2 define @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-LABEL: whilewr_8: @@ -1190,6 +1190,356 @@ for.cond.cleanup: ret void } +define @no_whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: no_whilewr_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub x8, x1, x2 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: sbfx x8, x9, #0, #1 +; CHECK-NEXT: whilelo p1.h, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: no_whilewr_8: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: sub x8, x1, x2 +; CHECK-NOSVE2-NEXT: cmp x8, #1 +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %c14 = ptrtoint ptr %c to i64 + %b15 = ptrtoint ptr %b to i64 + %sub.diff = sub i64 %b15, %c14 + %neg.compare = icmp sle i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %sub.diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + +define @no_whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: no_whilewr_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub x8, x1, x2 +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NEXT: asr x8, x8, #1 +; CHECK-NEXT: whilelo p0.s, xzr, x9 +; CHECK-NEXT: whilelo p1.s, xzr, x8 +; CHECK-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: no_whilewr_16: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: sub x8, x1, x2 +; CHECK-NOSVE2-NEXT: cmp x8, #1 +; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NOSVE2-NEXT: asr x8, x8, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 +; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x8 +; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b14 = ptrtoint ptr %b to i64 + %c15 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %b14, %c15 + %diff = sdiv i64 %sub.diff, 2 + %neg.compare = icmp sle i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + +define @no_whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: no_whilewr_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NEXT: sub x9, x1, x2 +; CHECK-NEXT: movk x8, #21846 +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: smulh x8, x9, x8 +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NEXT: whilelo p0.d, xzr, x9 +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: no_whilewr_32: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NOSVE2-NEXT: sub x9, x1, x2 +; CHECK-NOSVE2-NEXT: movk x8, #21846 +; CHECK-NOSVE2-NEXT: cmp x9, #1 +; CHECK-NOSVE2-NEXT: smulh x8, x9, x8 +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 +; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b14 = ptrtoint ptr %b to i64 + %c15 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %b14, %c15 + %diff = sdiv i64 %sub.diff, 3 + %neg.compare = icmp sle i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + +define @no_whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: no_whilewr_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub x8, x1, x2 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: add x9, x8, #3 +; CHECK-NEXT: cmp x8, #0 +; CHECK-NEXT: csel x9, x9, x8, lt +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: asr x9, x9, #2 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: mov z1.d, x9 +; CHECK-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NEXT: cmphi p0.d, p0/z, z1.d, z0.d +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: no_whilewr_64: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: sub x8, x1, x2 +; CHECK-NOSVE2-NEXT: index z0.d, #0, #1 +; CHECK-NOSVE2-NEXT: ptrue p0.d +; CHECK-NOSVE2-NEXT: add x9, x8, #3 +; CHECK-NOSVE2-NEXT: cmp x8, #0 +; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt +; CHECK-NOSVE2-NEXT: cmp x8, #1 +; CHECK-NOSVE2-NEXT: asr x9, x9, #2 +; CHECK-NOSVE2-NEXT: cset w8, lt +; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NOSVE2-NEXT: mov z1.d, x9 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d +; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b +; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b14 = ptrtoint ptr %b to i64 + %c15 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %b14, %c15 + %diff = sdiv i64 %sub.diff, 4 + %neg.compare = icmp sle i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + +define @no_whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: no_whilerw_8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x2 +; CHECK-NEXT: cneg x8, x8, mi +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: sbfx x8, x9, #0, #1 +; CHECK-NEXT: whilelo p1.h, xzr, x8 +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: no_whilerw_8: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: subs x8, x1, x2 +; CHECK-NOSVE2-NEXT: cneg x8, x8, mi +; CHECK-NOSVE2-NEXT: cmp x8, #1 +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8 +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %c14 = ptrtoint ptr %c to i64 + %b15 = ptrtoint ptr %b to i64 + %sub.diff = sub i64 %b15, %c14 + %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) + %neg.compare = icmp sle i64 %0, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %0) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + +define @no_whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: no_whilerw_16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x2 +; CHECK-NEXT: cneg x9, x8, mi +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: add x8, x9, x9, lsr #63 +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NEXT: asr x8, x8, #1 +; CHECK-NEXT: whilelo p0.s, xzr, x9 +; CHECK-NEXT: whilelo p1.s, xzr, x8 +; CHECK-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: no_whilerw_16: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: subs x8, x1, x2 +; CHECK-NOSVE2-NEXT: cneg x9, x8, mi +; CHECK-NOSVE2-NEXT: cmp x8, #1 +; CHECK-NOSVE2-NEXT: add x8, x9, x9, lsr #63 +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NOSVE2-NEXT: asr x8, x8, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 +; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x8 +; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b14 = ptrtoint ptr %b to i64 + %c15 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %b14, %c15 + %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) + %diff = sdiv i64 %0, 2 + %neg.compare = icmp sle i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + +define @no_whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: no_whilerw_32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NEXT: subs x9, x1, x2 +; CHECK-NEXT: movk x8, #21846 +; CHECK-NEXT: cneg x10, x9, mi +; CHECK-NEXT: cmp x9, #1 +; CHECK-NEXT: smulh x8, x10, x8 +; CHECK-NEXT: cset w9, lt +; CHECK-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NEXT: whilelo p0.d, xzr, x9 +; CHECK-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: no_whilerw_32: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 +; CHECK-NOSVE2-NEXT: subs x9, x1, x2 +; CHECK-NOSVE2-NEXT: movk x8, #21846 +; CHECK-NOSVE2-NEXT: cneg x10, x9, mi +; CHECK-NOSVE2-NEXT: cmp x9, #1 +; CHECK-NOSVE2-NEXT: smulh x8, x10, x8 +; CHECK-NOSVE2-NEXT: cset w9, lt +; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 +; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 +; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b14 = ptrtoint ptr %b to i64 + %c15 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %b14, %c15 + %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) + %diff = sdiv i64 %0, 3 + %neg.compare = icmp sle i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + +define @no_whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { +; CHECK-LABEL: no_whilerw_64: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: subs x8, x1, x2 +; CHECK-NEXT: index z0.d, #0, #1 +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: cneg x9, x8, mi +; CHECK-NEXT: add x10, x9, #3 +; CHECK-NEXT: cmp x9, #0 +; CHECK-NEXT: csel x9, x10, x9, lt +; CHECK-NEXT: cmp x8, #1 +; CHECK-NEXT: asr x9, x9, #2 +; CHECK-NEXT: cset w8, lt +; CHECK-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NEXT: mov z1.d, x9 +; CHECK-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NEXT: cmphi p0.d, p0/z, z1.d, z0.d +; CHECK-NEXT: punpklo p1.h, p1.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NEXT: ret +; +; CHECK-NOSVE2-LABEL: no_whilerw_64: +; CHECK-NOSVE2: // %bb.0: // %entry +; CHECK-NOSVE2-NEXT: subs x8, x1, x2 +; CHECK-NOSVE2-NEXT: index z0.d, #0, #1 +; CHECK-NOSVE2-NEXT: ptrue p0.d +; CHECK-NOSVE2-NEXT: cneg x9, x8, mi +; CHECK-NOSVE2-NEXT: add x10, x9, #3 +; CHECK-NOSVE2-NEXT: cmp x9, #0 +; CHECK-NOSVE2-NEXT: csel x9, x10, x9, lt +; CHECK-NOSVE2-NEXT: cmp x8, #1 +; CHECK-NOSVE2-NEXT: asr x9, x9, #2 +; CHECK-NOSVE2-NEXT: cset w8, lt +; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 +; CHECK-NOSVE2-NEXT: mov z1.d, x9 +; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 +; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d +; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b +; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b +; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b +; CHECK-NOSVE2-NEXT: ret +entry: + %b14 = ptrtoint ptr %b to i64 + %c15 = ptrtoint ptr %c to i64 + %sub.diff = sub i64 %b14, %c15 + %0 = tail call i64 @llvm.abs.i64(i64 %sub.diff, i1 false) + %diff = sdiv i64 %0, 4 + %neg.compare = icmp sle i64 %sub.diff, 0 + %.splatinsert = insertelement poison, i1 %neg.compare, i64 0 + %.splat = shufflevector %.splatinsert, poison, zeroinitializer + %ptr.diff.lane.mask = tail call @llvm.get.active.lane.mask.nxv1i1.i64(i64 0, i64 %diff) + %active.lane.mask.alias = or %ptr.diff.lane.mask, %.splat + ret %active.lane.mask.alias +} + declare i64 @llvm.vscale.i64() declare @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64) From 41c2bcd48b15ed196bb482a48824a8b427074b79 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 5 Nov 2024 15:50:39 +0000 Subject: [PATCH 4/4] Combine identical CHECK statements --- llvm/test/CodeGen/AArch64/whilewr.ll | 624 +++++++++++---------------- 1 file changed, 241 insertions(+), 383 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/whilewr.ll b/llvm/test/CodeGen/AArch64/whilewr.ll index a67d5920092f6..006c1c180ac45 100644 --- a/llvm/test/CodeGen/AArch64/whilewr.ll +++ b/llvm/test/CodeGen/AArch64/whilewr.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s -; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefix=CHECK,CHECK-NOSVE2 +; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve2 -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SVE2 +; RUN: llc %s -mtriple=aarch64-linux-gnu -mattr=+sve -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOSVE2 define @whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilewr p0.b, x1, x2 -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_8: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: whilewr p0.b, x1, x2 +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_8: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: sub x8, x1, x2 @@ -31,11 +31,11 @@ entry: } define @whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilerw_8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilerw p0.b, x1, x2 -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilerw_8: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: whilerw p0.b, x1, x2 +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilerw_8: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: subs x8, x1, x2 @@ -61,11 +61,11 @@ entry: } define @whilewr_commutative(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_commutative: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilewr p0.b, x1, x2 -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_commutative: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: whilewr p0.b, x1, x2 +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_commutative: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: sub x8, x1, x2 @@ -89,11 +89,11 @@ entry: } define @whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilewr p0.h, x1, x2 -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_16: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: whilewr p0.h, x1, x2 +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_16: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: sub x8, x1, x2 @@ -120,11 +120,11 @@ entry: } define @whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilerw_16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilerw p0.h, x1, x2 -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilerw_16: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: whilerw p0.h, x1, x2 +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilerw_16: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: subs x8, x1, x2 @@ -153,11 +153,11 @@ entry: } define @whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilewr p0.s, x1, x2 -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_32: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: whilewr p0.s, x1, x2 +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_32: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: sub x8, x1, x2 @@ -186,11 +186,11 @@ entry: } define @whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilerw_32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilerw p0.s, x1, x2 -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilerw_32: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: whilerw p0.s, x1, x2 +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilerw_32: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: subs x8, x1, x2 @@ -220,11 +220,11 @@ entry: } define @whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilewr p0.d, x1, x2 -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_64: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: whilewr p0.d, x1, x2 +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_64: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: sub x8, x1, x2 @@ -253,11 +253,11 @@ entry: } define @whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilerw_64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: whilerw p0.d, x1, x2 -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilerw_64: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: whilerw p0.d, x1, x2 +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilerw_64: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: subs x8, x1, x2 @@ -306,26 +306,6 @@ define @no_whilewr_128(ptr noalias %a, ptr %b, ptr %c, i32 %n) ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b ; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: no_whilewr_128: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: sub x8, x1, x2 -; CHECK-NOSVE2-NEXT: index z0.d, #0, #1 -; CHECK-NOSVE2-NEXT: ptrue p0.d -; CHECK-NOSVE2-NEXT: add x9, x8, #15 -; CHECK-NOSVE2-NEXT: cmp x8, #0 -; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt -; CHECK-NOSVE2-NEXT: cmp x8, #1 -; CHECK-NOSVE2-NEXT: asr x9, x9, #4 -; CHECK-NOSVE2-NEXT: cset w8, lt -; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NOSVE2-NEXT: mov z1.d, x9 -; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 -; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d -; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b -; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b -; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NOSVE2-NEXT: ret entry: %b12 = ptrtoint ptr %b to i64 %c13 = ptrtoint ptr %c to i64 @@ -340,30 +320,30 @@ entry: } define void @whilewr_loop_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_loop_8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB10_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: whilewr p0.b, x1, x2 -; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: whilelo p1.b, xzr, x9 -; CHECK-NEXT: cntp x10, p0, p0.b -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB10_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8] -; CHECK-NEXT: add z0.b, z1.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8] -; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p1.b, x8, x9 -; CHECK-NEXT: b.mi .LBB10_2 -; CHECK-NEXT: .LBB10_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_loop_8: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: cmp w3, #1 +; CHECK-SVE2-NEXT: b.lt .LBB10_3 +; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2-NEXT: whilewr p0.b, x1, x2 +; CHECK-SVE2-NEXT: mov w9, w3 +; CHECK-SVE2-NEXT: mov x8, xzr +; CHECK-SVE2-NEXT: whilelo p1.b, xzr, x9 +; CHECK-SVE2-NEXT: cntp x10, p0, p0.b +; CHECK-SVE2-NEXT: and x10, x10, #0xff +; CHECK-SVE2-NEXT: .LBB10_2: // %vector.body +; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-SVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; CHECK-SVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8] +; CHECK-SVE2-NEXT: add z0.b, z1.b, z0.b +; CHECK-SVE2-NEXT: st1b { z0.b }, p1, [x2, x8] +; CHECK-SVE2-NEXT: add x8, x8, x10 +; CHECK-SVE2-NEXT: whilelo p1.b, x8, x9 +; CHECK-SVE2-NEXT: b.mi .LBB10_2 +; CHECK-SVE2-NEXT: .LBB10_3: // %for.cond.cleanup +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_loop_8: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 @@ -434,28 +414,28 @@ for.cond.cleanup: } define void @whilewr_loop_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_loop_16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB11_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: whilewr p0.h, x1, x2 -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: whilelo p1.h, xzr, x8 -; CHECK-NEXT: .LBB11_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b -; CHECK-NEXT: ld1h { z0.h }, p2/z, [x0, x9, lsl #1] -; CHECK-NEXT: ld1h { z1.h }, p2/z, [x1, x9, lsl #1] -; CHECK-NEXT: add z0.h, z1.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2, x9, lsl #1] -; CHECK-NEXT: inch x9 -; CHECK-NEXT: whilelo p1.h, x9, x8 -; CHECK-NEXT: b.mi .LBB11_2 -; CHECK-NEXT: .LBB11_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_loop_16: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: cmp w3, #1 +; CHECK-SVE2-NEXT: b.lt .LBB11_3 +; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2-NEXT: mov w8, w3 +; CHECK-SVE2-NEXT: whilewr p0.h, x1, x2 +; CHECK-SVE2-NEXT: mov x9, xzr +; CHECK-SVE2-NEXT: whilelo p1.h, xzr, x8 +; CHECK-SVE2-NEXT: .LBB11_2: // %vector.body +; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2-NEXT: and p2.b, p0/z, p0.b, p1.b +; CHECK-SVE2-NEXT: ld1h { z0.h }, p2/z, [x0, x9, lsl #1] +; CHECK-SVE2-NEXT: ld1h { z1.h }, p2/z, [x1, x9, lsl #1] +; CHECK-SVE2-NEXT: add z0.h, z1.h, z0.h +; CHECK-SVE2-NEXT: st1h { z0.h }, p1, [x2, x9, lsl #1] +; CHECK-SVE2-NEXT: inch x9 +; CHECK-SVE2-NEXT: whilelo p1.h, x9, x8 +; CHECK-SVE2-NEXT: b.mi .LBB11_2 +; CHECK-SVE2-NEXT: .LBB11_3: // %for.cond.cleanup +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_loop_16: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 @@ -527,28 +507,28 @@ for.cond.cleanup: } define void @whilewr_loop_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_loop_32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB12_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: whilewr p0.s, x1, x2 -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: whilelo p1.s, xzr, x8 -; CHECK-NEXT: .LBB12_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b -; CHECK-NEXT: ld1w { z0.s }, p2/z, [x0, x9, lsl #2] -; CHECK-NEXT: ld1w { z1.s }, p2/z, [x1, x9, lsl #2] -; CHECK-NEXT: add z0.s, z1.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2, x9, lsl #2] -; CHECK-NEXT: incw x9 -; CHECK-NEXT: whilelo p1.s, x9, x8 -; CHECK-NEXT: b.mi .LBB12_2 -; CHECK-NEXT: .LBB12_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_loop_32: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: cmp w3, #1 +; CHECK-SVE2-NEXT: b.lt .LBB12_3 +; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2-NEXT: mov w8, w3 +; CHECK-SVE2-NEXT: whilewr p0.s, x1, x2 +; CHECK-SVE2-NEXT: mov x9, xzr +; CHECK-SVE2-NEXT: whilelo p1.s, xzr, x8 +; CHECK-SVE2-NEXT: .LBB12_2: // %vector.body +; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2-NEXT: and p2.b, p0/z, p0.b, p1.b +; CHECK-SVE2-NEXT: ld1w { z0.s }, p2/z, [x0, x9, lsl #2] +; CHECK-SVE2-NEXT: ld1w { z1.s }, p2/z, [x1, x9, lsl #2] +; CHECK-SVE2-NEXT: add z0.s, z1.s, z0.s +; CHECK-SVE2-NEXT: st1w { z0.s }, p1, [x2, x9, lsl #2] +; CHECK-SVE2-NEXT: incw x9 +; CHECK-SVE2-NEXT: whilelo p1.s, x9, x8 +; CHECK-SVE2-NEXT: b.mi .LBB12_2 +; CHECK-SVE2-NEXT: .LBB12_3: // %for.cond.cleanup +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_loop_32: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 @@ -622,28 +602,28 @@ for.cond.cleanup: } define void @whilewr_loop_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_loop_64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB13_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: mov w8, w3 -; CHECK-NEXT: whilewr p0.d, x1, x2 -; CHECK-NEXT: mov x9, xzr -; CHECK-NEXT: whilelo p1.d, xzr, x8 -; CHECK-NEXT: .LBB13_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p2.b, p0/z, p0.b, p1.b -; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0, x9, lsl #3] -; CHECK-NEXT: ld1d { z1.d }, p2/z, [x1, x9, lsl #3] -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2, x9, lsl #3] -; CHECK-NEXT: incd x9 -; CHECK-NEXT: whilelo p1.d, x9, x8 -; CHECK-NEXT: b.mi .LBB13_2 -; CHECK-NEXT: .LBB13_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_loop_64: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: cmp w3, #1 +; CHECK-SVE2-NEXT: b.lt .LBB13_3 +; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2-NEXT: mov w8, w3 +; CHECK-SVE2-NEXT: whilewr p0.d, x1, x2 +; CHECK-SVE2-NEXT: mov x9, xzr +; CHECK-SVE2-NEXT: whilelo p1.d, xzr, x8 +; CHECK-SVE2-NEXT: .LBB13_2: // %vector.body +; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2-NEXT: and p2.b, p0/z, p0.b, p1.b +; CHECK-SVE2-NEXT: ld1d { z0.d }, p2/z, [x0, x9, lsl #3] +; CHECK-SVE2-NEXT: ld1d { z1.d }, p2/z, [x1, x9, lsl #3] +; CHECK-SVE2-NEXT: add z0.d, z1.d, z0.d +; CHECK-SVE2-NEXT: st1d { z0.d }, p1, [x2, x9, lsl #3] +; CHECK-SVE2-NEXT: incd x9 +; CHECK-SVE2-NEXT: whilelo p1.d, x9, x8 +; CHECK-SVE2-NEXT: b.mi .LBB13_2 +; CHECK-SVE2-NEXT: .LBB13_3: // %for.cond.cleanup +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_loop_64: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 @@ -717,32 +697,32 @@ for.cond.cleanup: } define void @whilewr_loop_multiple_8(ptr %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_loop_multiple_8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB14_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: whilewr p0.b, x0, x2 -; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: whilewr p1.b, x1, x2 -; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NEXT: whilelo p1.b, xzr, x9 -; CHECK-NEXT: cntp x10, p0, p0.b -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB14_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NEXT: ld1b { z0.b }, p1/z, [x0, x8] -; CHECK-NEXT: ld1b { z1.b }, p1/z, [x1, x8] -; CHECK-NEXT: add z0.b, z1.b, z0.b -; CHECK-NEXT: st1b { z0.b }, p1, [x2, x8] -; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p1.b, x8, x9 -; CHECK-NEXT: b.mi .LBB14_2 -; CHECK-NEXT: .LBB14_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_loop_multiple_8: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: cmp w3, #1 +; CHECK-SVE2-NEXT: b.lt .LBB14_3 +; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2-NEXT: whilewr p0.b, x0, x2 +; CHECK-SVE2-NEXT: mov w9, w3 +; CHECK-SVE2-NEXT: mov x8, xzr +; CHECK-SVE2-NEXT: whilewr p1.b, x1, x2 +; CHECK-SVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-SVE2-NEXT: whilelo p1.b, xzr, x9 +; CHECK-SVE2-NEXT: cntp x10, p0, p0.b +; CHECK-SVE2-NEXT: and x10, x10, #0xff +; CHECK-SVE2-NEXT: .LBB14_2: // %vector.body +; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-SVE2-NEXT: ld1b { z0.b }, p1/z, [x0, x8] +; CHECK-SVE2-NEXT: ld1b { z1.b }, p1/z, [x1, x8] +; CHECK-SVE2-NEXT: add z0.b, z1.b, z0.b +; CHECK-SVE2-NEXT: st1b { z0.b }, p1, [x2, x8] +; CHECK-SVE2-NEXT: add x8, x8, x10 +; CHECK-SVE2-NEXT: whilelo p1.b, x8, x9 +; CHECK-SVE2-NEXT: b.mi .LBB14_2 +; CHECK-SVE2-NEXT: .LBB14_3: // %for.cond.cleanup +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_8: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 @@ -829,32 +809,32 @@ for.cond.cleanup: } define void @whilewr_loop_multiple_16(ptr %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_loop_multiple_16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB15_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: whilewr p0.h, x0, x2 -; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: whilewr p1.h, x1, x2 -; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NEXT: whilelo p1.h, xzr, x9 -; CHECK-NEXT: cntp x10, p0, p0.h -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB15_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] -; CHECK-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1] -; CHECK-NEXT: add z0.h, z1.h, z0.h -; CHECK-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1] -; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p1.h, x8, x9 -; CHECK-NEXT: b.mi .LBB15_2 -; CHECK-NEXT: .LBB15_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_loop_multiple_16: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: cmp w3, #1 +; CHECK-SVE2-NEXT: b.lt .LBB15_3 +; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2-NEXT: whilewr p0.h, x0, x2 +; CHECK-SVE2-NEXT: mov w9, w3 +; CHECK-SVE2-NEXT: mov x8, xzr +; CHECK-SVE2-NEXT: whilewr p1.h, x1, x2 +; CHECK-SVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-SVE2-NEXT: whilelo p1.h, xzr, x9 +; CHECK-SVE2-NEXT: cntp x10, p0, p0.h +; CHECK-SVE2-NEXT: and x10, x10, #0xff +; CHECK-SVE2-NEXT: .LBB15_2: // %vector.body +; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-SVE2-NEXT: ld1h { z0.h }, p1/z, [x0, x8, lsl #1] +; CHECK-SVE2-NEXT: ld1h { z1.h }, p1/z, [x1, x8, lsl #1] +; CHECK-SVE2-NEXT: add z0.h, z1.h, z0.h +; CHECK-SVE2-NEXT: st1h { z0.h }, p1, [x2, x8, lsl #1] +; CHECK-SVE2-NEXT: add x8, x8, x10 +; CHECK-SVE2-NEXT: whilelo p1.h, x8, x9 +; CHECK-SVE2-NEXT: b.mi .LBB15_2 +; CHECK-SVE2-NEXT: .LBB15_3: // %for.cond.cleanup +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_16: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 @@ -947,32 +927,32 @@ for.cond.cleanup: } define void @whilewr_loop_multiple_32(ptr %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_loop_multiple_32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB16_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: whilewr p0.s, x0, x2 -; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: whilewr p1.s, x1, x2 -; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NEXT: whilelo p1.s, xzr, x9 -; CHECK-NEXT: cntp x10, p0, p0.s -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB16_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] -; CHECK-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2] -; CHECK-NEXT: add z0.s, z1.s, z0.s -; CHECK-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] -; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p1.s, x8, x9 -; CHECK-NEXT: b.mi .LBB16_2 -; CHECK-NEXT: .LBB16_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_loop_multiple_32: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: cmp w3, #1 +; CHECK-SVE2-NEXT: b.lt .LBB16_3 +; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2-NEXT: whilewr p0.s, x0, x2 +; CHECK-SVE2-NEXT: mov w9, w3 +; CHECK-SVE2-NEXT: mov x8, xzr +; CHECK-SVE2-NEXT: whilewr p1.s, x1, x2 +; CHECK-SVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-SVE2-NEXT: whilelo p1.s, xzr, x9 +; CHECK-SVE2-NEXT: cntp x10, p0, p0.s +; CHECK-SVE2-NEXT: and x10, x10, #0xff +; CHECK-SVE2-NEXT: .LBB16_2: // %vector.body +; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-SVE2-NEXT: ld1w { z0.s }, p1/z, [x0, x8, lsl #2] +; CHECK-SVE2-NEXT: ld1w { z1.s }, p1/z, [x1, x8, lsl #2] +; CHECK-SVE2-NEXT: add z0.s, z1.s, z0.s +; CHECK-SVE2-NEXT: st1w { z0.s }, p1, [x2, x8, lsl #2] +; CHECK-SVE2-NEXT: add x8, x8, x10 +; CHECK-SVE2-NEXT: whilelo p1.s, x8, x9 +; CHECK-SVE2-NEXT: b.mi .LBB16_2 +; CHECK-SVE2-NEXT: .LBB16_3: // %for.cond.cleanup +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_32: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 @@ -1069,32 +1049,32 @@ for.cond.cleanup: } define void @whilewr_loop_multiple_64(ptr %a, ptr %b, ptr %c, i32 %n) { -; CHECK-LABEL: whilewr_loop_multiple_64: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmp w3, #1 -; CHECK-NEXT: b.lt .LBB17_3 -; CHECK-NEXT: // %bb.1: // %for.body.preheader -; CHECK-NEXT: whilewr p0.d, x0, x2 -; CHECK-NEXT: mov w9, w3 -; CHECK-NEXT: mov x8, xzr -; CHECK-NEXT: whilewr p1.d, x1, x2 -; CHECK-NEXT: and p0.b, p0/z, p0.b, p1.b -; CHECK-NEXT: whilelo p1.d, xzr, x9 -; CHECK-NEXT: cntp x10, p0, p0.d -; CHECK-NEXT: and x10, x10, #0xff -; CHECK-NEXT: .LBB17_2: // %vector.body -; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: and p1.b, p1/z, p1.b, p0.b -; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] -; CHECK-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] -; CHECK-NEXT: add z0.d, z1.d, z0.d -; CHECK-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] -; CHECK-NEXT: add x8, x8, x10 -; CHECK-NEXT: whilelo p1.d, x8, x9 -; CHECK-NEXT: b.mi .LBB17_2 -; CHECK-NEXT: .LBB17_3: // %for.cond.cleanup -; CHECK-NEXT: ret -; +; CHECK-SVE2-LABEL: whilewr_loop_multiple_64: +; CHECK-SVE2: // %bb.0: // %entry +; CHECK-SVE2-NEXT: cmp w3, #1 +; CHECK-SVE2-NEXT: b.lt .LBB17_3 +; CHECK-SVE2-NEXT: // %bb.1: // %for.body.preheader +; CHECK-SVE2-NEXT: whilewr p0.d, x0, x2 +; CHECK-SVE2-NEXT: mov w9, w3 +; CHECK-SVE2-NEXT: mov x8, xzr +; CHECK-SVE2-NEXT: whilewr p1.d, x1, x2 +; CHECK-SVE2-NEXT: and p0.b, p0/z, p0.b, p1.b +; CHECK-SVE2-NEXT: whilelo p1.d, xzr, x9 +; CHECK-SVE2-NEXT: cntp x10, p0, p0.d +; CHECK-SVE2-NEXT: and x10, x10, #0xff +; CHECK-SVE2-NEXT: .LBB17_2: // %vector.body +; CHECK-SVE2-NEXT: // =>This Inner Loop Header: Depth=1 +; CHECK-SVE2-NEXT: and p1.b, p1/z, p1.b, p0.b +; CHECK-SVE2-NEXT: ld1d { z0.d }, p1/z, [x0, x8, lsl #3] +; CHECK-SVE2-NEXT: ld1d { z1.d }, p1/z, [x1, x8, lsl #3] +; CHECK-SVE2-NEXT: add z0.d, z1.d, z0.d +; CHECK-SVE2-NEXT: st1d { z0.d }, p1, [x2, x8, lsl #3] +; CHECK-SVE2-NEXT: add x8, x8, x10 +; CHECK-SVE2-NEXT: whilelo p1.d, x8, x9 +; CHECK-SVE2-NEXT: b.mi .LBB17_2 +; CHECK-SVE2-NEXT: .LBB17_3: // %for.cond.cleanup +; CHECK-SVE2-NEXT: ret + ; CHECK-NOSVE2-LABEL: whilewr_loop_multiple_64: ; CHECK-NOSVE2: // %bb.0: // %entry ; CHECK-NOSVE2-NEXT: cmp w3, #1 @@ -1201,17 +1181,6 @@ define @no_whilewr_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: whilelo p1.h, xzr, x8 ; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b ; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: no_whilewr_8: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: sub x8, x1, x2 -; CHECK-NOSVE2-NEXT: cmp x8, #1 -; CHECK-NOSVE2-NEXT: cset w9, lt -; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x8 -; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8 -; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NOSVE2-NEXT: ret entry: %c14 = ptrtoint ptr %c to i64 %b15 = ptrtoint ptr %b to i64 @@ -1237,19 +1206,6 @@ define @no_whilewr_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) ; CHECK-NEXT: whilelo p1.s, xzr, x8 ; CHECK-NEXT: mov p0.b, p1/m, p1.b ; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: no_whilewr_16: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: sub x8, x1, x2 -; CHECK-NOSVE2-NEXT: cmp x8, #1 -; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63 -; CHECK-NOSVE2-NEXT: cset w9, lt -; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 -; CHECK-NOSVE2-NEXT: asr x8, x8, #1 -; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 -; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x8 -; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b -; CHECK-NOSVE2-NEXT: ret entry: %b14 = ptrtoint ptr %b to i64 %c15 = ptrtoint ptr %c to i64 @@ -1278,21 +1234,6 @@ define @no_whilewr_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) ; CHECK-NEXT: whilelo p1.d, xzr, x8 ; CHECK-NEXT: mov p0.b, p1/m, p1.b ; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: no_whilewr_32: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 -; CHECK-NOSVE2-NEXT: sub x9, x1, x2 -; CHECK-NOSVE2-NEXT: movk x8, #21846 -; CHECK-NOSVE2-NEXT: cmp x9, #1 -; CHECK-NOSVE2-NEXT: smulh x8, x9, x8 -; CHECK-NOSVE2-NEXT: cset w9, lt -; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 -; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63 -; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 -; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b -; CHECK-NOSVE2-NEXT: ret entry: %b14 = ptrtoint ptr %b to i64 %c15 = ptrtoint ptr %c to i64 @@ -1326,26 +1267,6 @@ define @no_whilewr_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b ; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: no_whilewr_64: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: sub x8, x1, x2 -; CHECK-NOSVE2-NEXT: index z0.d, #0, #1 -; CHECK-NOSVE2-NEXT: ptrue p0.d -; CHECK-NOSVE2-NEXT: add x9, x8, #3 -; CHECK-NOSVE2-NEXT: cmp x8, #0 -; CHECK-NOSVE2-NEXT: csel x9, x9, x8, lt -; CHECK-NOSVE2-NEXT: cmp x8, #1 -; CHECK-NOSVE2-NEXT: asr x9, x9, #2 -; CHECK-NOSVE2-NEXT: cset w8, lt -; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NOSVE2-NEXT: mov z1.d, x9 -; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 -; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d -; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b -; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b -; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NOSVE2-NEXT: ret entry: %b14 = ptrtoint ptr %b to i64 %c15 = ptrtoint ptr %c to i64 @@ -1371,18 +1292,6 @@ define @no_whilerw_8(ptr noalias %a, ptr %b, ptr %c, i32 %n) { ; CHECK-NEXT: whilelo p1.h, xzr, x8 ; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b ; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: no_whilerw_8: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: subs x8, x1, x2 -; CHECK-NOSVE2-NEXT: cneg x8, x8, mi -; CHECK-NOSVE2-NEXT: cmp x8, #1 -; CHECK-NOSVE2-NEXT: cset w9, lt -; CHECK-NOSVE2-NEXT: whilelo p0.h, xzr, x8 -; CHECK-NOSVE2-NEXT: sbfx x8, x9, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p1.h, xzr, x8 -; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NOSVE2-NEXT: ret entry: %c14 = ptrtoint ptr %c to i64 %b15 = ptrtoint ptr %b to i64 @@ -1410,20 +1319,6 @@ define @no_whilerw_16(ptr noalias %a, ptr %b, ptr %c, i32 %n) ; CHECK-NEXT: whilelo p1.s, xzr, x8 ; CHECK-NEXT: mov p0.b, p1/m, p1.b ; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: no_whilerw_16: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: subs x8, x1, x2 -; CHECK-NOSVE2-NEXT: cneg x9, x8, mi -; CHECK-NOSVE2-NEXT: cmp x8, #1 -; CHECK-NOSVE2-NEXT: add x8, x9, x9, lsr #63 -; CHECK-NOSVE2-NEXT: cset w9, lt -; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 -; CHECK-NOSVE2-NEXT: asr x8, x8, #1 -; CHECK-NOSVE2-NEXT: whilelo p0.s, xzr, x9 -; CHECK-NOSVE2-NEXT: whilelo p1.s, xzr, x8 -; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b -; CHECK-NOSVE2-NEXT: ret entry: %b14 = ptrtoint ptr %b to i64 %c15 = ptrtoint ptr %c to i64 @@ -1454,22 +1349,6 @@ define @no_whilerw_32(ptr noalias %a, ptr %b, ptr %c, i32 %n) ; CHECK-NEXT: whilelo p1.d, xzr, x8 ; CHECK-NEXT: mov p0.b, p1/m, p1.b ; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: no_whilerw_32: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: mov x8, #6148914691236517205 // =0x5555555555555555 -; CHECK-NOSVE2-NEXT: subs x9, x1, x2 -; CHECK-NOSVE2-NEXT: movk x8, #21846 -; CHECK-NOSVE2-NEXT: cneg x10, x9, mi -; CHECK-NOSVE2-NEXT: cmp x9, #1 -; CHECK-NOSVE2-NEXT: smulh x8, x10, x8 -; CHECK-NOSVE2-NEXT: cset w9, lt -; CHECK-NOSVE2-NEXT: sbfx x9, x9, #0, #1 -; CHECK-NOSVE2-NEXT: whilelo p0.d, xzr, x9 -; CHECK-NOSVE2-NEXT: add x8, x8, x8, lsr #63 -; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 -; CHECK-NOSVE2-NEXT: mov p0.b, p1/m, p1.b -; CHECK-NOSVE2-NEXT: ret entry: %b14 = ptrtoint ptr %b to i64 %c15 = ptrtoint ptr %c to i64 @@ -1505,27 +1384,6 @@ define @no_whilerw_64(ptr noalias %a, ptr %b, ptr %c, i32 %n) ; CHECK-NEXT: punpklo p0.h, p0.b ; CHECK-NEXT: sel p0.b, p0, p0.b, p1.b ; CHECK-NEXT: ret -; -; CHECK-NOSVE2-LABEL: no_whilerw_64: -; CHECK-NOSVE2: // %bb.0: // %entry -; CHECK-NOSVE2-NEXT: subs x8, x1, x2 -; CHECK-NOSVE2-NEXT: index z0.d, #0, #1 -; CHECK-NOSVE2-NEXT: ptrue p0.d -; CHECK-NOSVE2-NEXT: cneg x9, x8, mi -; CHECK-NOSVE2-NEXT: add x10, x9, #3 -; CHECK-NOSVE2-NEXT: cmp x9, #0 -; CHECK-NOSVE2-NEXT: csel x9, x10, x9, lt -; CHECK-NOSVE2-NEXT: cmp x8, #1 -; CHECK-NOSVE2-NEXT: asr x9, x9, #2 -; CHECK-NOSVE2-NEXT: cset w8, lt -; CHECK-NOSVE2-NEXT: sbfx x8, x8, #0, #1 -; CHECK-NOSVE2-NEXT: mov z1.d, x9 -; CHECK-NOSVE2-NEXT: whilelo p1.d, xzr, x8 -; CHECK-NOSVE2-NEXT: cmphi p0.d, p0/z, z1.d, z0.d -; CHECK-NOSVE2-NEXT: punpklo p1.h, p1.b -; CHECK-NOSVE2-NEXT: punpklo p0.h, p0.b -; CHECK-NOSVE2-NEXT: sel p0.b, p0, p0.b, p1.b -; CHECK-NOSVE2-NEXT: ret entry: %b14 = ptrtoint ptr %b to i64 %c15 = ptrtoint ptr %c to i64