Skip to content

Commit

Permalink
[LoongArch] Optimize vector bitreverse using scalar bitrev and vshuf4i (
Browse files Browse the repository at this point in the history
#118054)

Custom-lower vector-typed bitreverse to scalar bitrev and vshuf4i
instructions.

Keep `v2i64` and `v4i64` bitreverse as `Expand`, since expanding is good enough for those types.
  • Loading branch information
zhaoqi5 authored Dec 10, 2024
1 parent 41ed16c commit 953838d
Show file tree
Hide file tree
Showing 5 changed files with 115 additions and 102 deletions.
51 changes: 51 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
{ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT,
Expand);
}
for (MVT VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
setOperationAction(ISD::BITREVERSE, VT, Custom);
for (MVT VT : {MVT::v8i16, MVT::v4i32, MVT::v2i64})
setOperationAction(ISD::BSWAP, VT, Legal);
for (MVT VT : {MVT::v4i32, MVT::v2i64}) {
Expand Down Expand Up @@ -324,6 +326,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
{ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT,
Expand);
}
for (MVT VT : {MVT::v32i8, MVT::v16i16, MVT::v8i32})
setOperationAction(ISD::BITREVERSE, VT, Custom);
for (MVT VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64})
setOperationAction(ISD::BSWAP, VT, Legal);
for (MVT VT : {MVT::v8i32, MVT::v4i32, MVT::v4i64}) {
Expand Down Expand Up @@ -440,10 +444,56 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
return lowerBUILD_VECTOR(Op, DAG);
case ISD::VECTOR_SHUFFLE:
return lowerVECTOR_SHUFFLE(Op, DAG);
case ISD::BITREVERSE:
return lowerBITREVERSE(Op, DAG);
}
return SDValue();
}

// Custom-lower ISD::BITREVERSE for fixed vectors of i8/i16/i32 elements.
// Each 64-bit lane of the input is bit-reversed with a single scalar bitrev
// instruction, and for i16/i32 element types a vector shuffle then restores
// the element order within every lane.
SDValue LoongArchTargetLowering::lowerBITREVERSE(SDValue Op,
                                                 SelectionDAG &DAG) const {
  EVT ResTy = Op->getValueType(0);
  SDValue Src = Op->getOperand(0);
  SDLoc DL(Op);

  // View the source as v2i64 (128-bit) or v4i64 (256-bit) so each 64-bit
  // lane can be handled by one scalar instruction.
  EVT NewVT = ResTy.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
  unsigned OrigEltNum = ResTy.getVectorNumElements();
  unsigned NewEltNum = NewVT.getVectorNumElements();

  SDValue NewSrc = DAG.getNode(ISD::BITCAST, DL, NewVT, Src);

  // Bit-reverse each 64-bit element. For byte vectors, BITREV_8B (bitrev.8b)
  // reverses the bits inside every byte independently, which is exactly the
  // per-i8 bitreverse; for wider elements a full 64-bit ISD::BITREVERSE is
  // used and the in-lane element order is fixed up by the shuffle below.
  // NB: the loop variable must not shadow the function parameter `Op`.
  SmallVector<SDValue, 8> Ops;
  for (unsigned i = 0; i < NewEltNum; i++) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, NewSrc,
                              DAG.getConstant(i, DL, MVT::i64));
    SDValue RevOp = DAG.getNode((ResTy == MVT::v16i8 || ResTy == MVT::v32i8)
                                    ? LoongArchISD::BITREV_8B
                                    : ISD::BITREVERSE,
                                DL, MVT::i64, Elt);
    Ops.push_back(RevOp);
  }
  SDValue Res =
      DAG.getNode(ISD::BITCAST, DL, ResTy, DAG.getBuildVector(NewVT, DL, Ops));

  switch (ResTy.getSimpleVT().SimpleTy) {
  default:
    return SDValue();
  case MVT::v16i8:
  case MVT::v32i8:
    // bitrev.8b already produced the final per-byte result.
    return Res;
  case MVT::v8i16:
  case MVT::v16i16:
  case MVT::v4i32:
  case MVT::v8i32: {
    // The 64-bit bitreverse also reversed the order of the elements within
    // each 64-bit lane; undo that with a shuffle (selected as vshuf4i) whose
    // mask reverses the OrigEltNum/NewEltNum elements of every lane.
    SmallVector<int, 32> Mask;
    for (unsigned i = 0; i < NewEltNum; i++)
      for (int j = OrigEltNum / NewEltNum - 1; j >= 0; j--)
        Mask.push_back(j + (OrigEltNum / NewEltNum) * i);
    return DAG.getVectorShuffle(ResTy, DL, Res, DAG.getUNDEF(ResTy), Mask);
  }
  }
}

/// Determine whether a range fits a regular pattern of values.
/// This function accounts for the possibility of jumping over the End iterator.
template <typename ValType>
Expand Down Expand Up @@ -4685,6 +4735,7 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(REVB_2H)
NODE_NAME_CASE(REVB_2W)
NODE_NAME_CASE(BITREV_4B)
NODE_NAME_CASE(BITREV_8B)
NODE_NAME_CASE(BITREV_W)
NODE_NAME_CASE(ROTR_W)
NODE_NAME_CASE(ROTL_W)
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ enum NodeType : unsigned {
REVB_2H,
REVB_2W,
BITREV_4B,
BITREV_8B,
BITREV_W,

// Intrinsic operations start ============================================
Expand Down Expand Up @@ -334,6 +335,7 @@ class LoongArchTargetLowering : public TargetLowering {
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBITREVERSE(SDValue Op, SelectionDAG &DAG) const;

bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ def loongarch_bstrpick
def loongarch_revb_2h : SDNode<"LoongArchISD::REVB_2H", SDTUnaryOp>;
def loongarch_revb_2w : SDNode<"LoongArchISD::REVB_2W", SDTUnaryOp>;
def loongarch_bitrev_4b : SDNode<"LoongArchISD::BITREV_4B", SDTUnaryOp>;
def loongarch_bitrev_8b : SDNode<"LoongArchISD::BITREV_8B", SDTUnaryOp>;
def loongarch_bitrev_w : SDNode<"LoongArchISD::BITREV_W", SDTUnaryOp>;
def loongarch_clzw : SDNode<"LoongArchISD::CLZ_W", SDTIntBitCountUnaryOp>;
def loongarch_ctzw : SDNode<"LoongArchISD::CTZ_W", SDTIntBitCountUnaryOp>;
Expand Down Expand Up @@ -1765,6 +1766,7 @@ def : Pat<(bitreverse (bswap GPR:$rj)), (BITREV_4B GPR:$rj)>;
let Predicates = [IsLA64] in {
def : Pat<(loongarch_revb_2w GPR:$rj), (REVB_2W GPR:$rj)>;
def : Pat<(bswap GPR:$rj), (REVB_D GPR:$rj)>;
def : Pat<(loongarch_bitrev_8b GPR:$rj), (BITREV_8B GPR:$rj)>;
def : Pat<(loongarch_bitrev_w GPR:$rj), (BITREV_W GPR:$rj)>;
def : Pat<(bitreverse GPR:$rj), (BITREV_D GPR:$rj)>;
def : Pat<(bswap (bitreverse GPR:$rj)), (BITREV_8B GPR:$rj)>;
Expand Down
90 changes: 39 additions & 51 deletions llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,19 @@ declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
; CHECK-LABEL: test_bitreverse_v32i8:
; CHECK: # %bb.0:
; CHECK-NEXT: xvslli.b $xr1, $xr0, 4
; CHECK-NEXT: xvsrli.b $xr0, $xr0, 4
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvandi.b $xr1, $xr0, 51
; CHECK-NEXT: xvslli.b $xr1, $xr1, 2
; CHECK-NEXT: xvsrli.b $xr0, $xr0, 2
; CHECK-NEXT: xvandi.b $xr0, $xr0, 51
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvandi.b $xr1, $xr0, 85
; CHECK-NEXT: xvslli.b $xr1, $xr1, 1
; CHECK-NEXT: xvsrli.b $xr0, $xr0, 1
; CHECK-NEXT: xvandi.b $xr0, $xr0, 85
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
; CHECK-NEXT: bitrev.8b $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
; CHECK-NEXT: bitrev.8b $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
; CHECK-NEXT: bitrev.8b $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
; CHECK-NEXT: bitrev.8b $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
; CHECK-NEXT: ret
%b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
ret <32 x i8> %b
Expand All @@ -30,25 +30,19 @@ declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)
define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
; CHECK-LABEL: test_bitreverse_v16i16:
; CHECK: # %bb.0:
; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 177
; CHECK-NEXT: xvsrli.h $xr1, $xr0, 4
; CHECK-NEXT: xvrepli.b $xr2, 15
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
; CHECK-NEXT: xvslli.h $xr0, $xr0, 4
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
; CHECK-NEXT: xvsrli.h $xr1, $xr0, 2
; CHECK-NEXT: xvrepli.b $xr2, 51
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
; CHECK-NEXT: xvslli.h $xr0, $xr0, 2
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
; CHECK-NEXT: xvsrli.h $xr1, $xr0, 1
; CHECK-NEXT: xvrepli.b $xr2, 85
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
; CHECK-NEXT: xvslli.h $xr0, $xr0, 1
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
; CHECK-NEXT: xvshuf4i.h $xr0, $xr1, 27
; CHECK-NEXT: ret
%b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
ret <16 x i16> %b
Expand All @@ -59,25 +53,19 @@ declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
; CHECK-LABEL: test_bitreverse_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: xvshuf4i.b $xr0, $xr0, 27
; CHECK-NEXT: xvsrli.w $xr1, $xr0, 4
; CHECK-NEXT: xvrepli.b $xr2, 15
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
; CHECK-NEXT: xvslli.w $xr0, $xr0, 4
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
; CHECK-NEXT: xvsrli.w $xr1, $xr0, 2
; CHECK-NEXT: xvrepli.b $xr2, 51
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
; CHECK-NEXT: xvslli.w $xr0, $xr0, 2
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
; CHECK-NEXT: xvsrli.w $xr1, $xr0, 1
; CHECK-NEXT: xvrepli.b $xr2, 85
; CHECK-NEXT: xvand.v $xr1, $xr1, $xr2
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr2
; CHECK-NEXT: xvslli.w $xr0, $xr0, 1
; CHECK-NEXT: xvor.v $xr0, $xr1, $xr0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
; CHECK-NEXT: xvshuf4i.w $xr0, $xr1, 177
; CHECK-NEXT: ret
%b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
ret <8 x i32> %b
Expand Down
72 changes: 21 additions & 51 deletions llvm/test/CodeGen/LoongArch/lsx/bitreverse.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,13 @@ declare <16 x i8> @llvm.bitreverse.v16i8(<16 x i8>)
define <16 x i8> @test_bitreverse_v16i8(<16 x i8> %a) nounwind {
; CHECK-LABEL: test_bitreverse_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vslli.b $vr1, $vr0, 4
; CHECK-NEXT: vsrli.b $vr0, $vr0, 4
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vandi.b $vr1, $vr0, 51
; CHECK-NEXT: vslli.b $vr1, $vr1, 2
; CHECK-NEXT: vsrli.b $vr0, $vr0, 2
; CHECK-NEXT: vandi.b $vr0, $vr0, 51
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vandi.b $vr1, $vr0, 85
; CHECK-NEXT: vslli.b $vr1, $vr1, 1
; CHECK-NEXT: vsrli.b $vr0, $vr0, 1
; CHECK-NEXT: vandi.b $vr0, $vr0, 85
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
; CHECK-NEXT: bitrev.8b $a0, $a0
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
; CHECK-NEXT: bitrev.8b $a0, $a0
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
; CHECK-NEXT: vori.b $vr0, $vr1, 0
; CHECK-NEXT: ret
%b = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> %a)
ret <16 x i8> %b
Expand All @@ -30,25 +24,13 @@ declare <8 x i16> @llvm.bitreverse.v8i16(<8 x i16>)
define <8 x i16> @test_bitreverse_v8i16(<8 x i16> %a) nounwind {
; CHECK-LABEL: test_bitreverse_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 177
; CHECK-NEXT: vsrli.h $vr1, $vr0, 4
; CHECK-NEXT: vrepli.b $vr2, 15
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
; CHECK-NEXT: vslli.h $vr0, $vr0, 4
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
; CHECK-NEXT: vsrli.h $vr1, $vr0, 2
; CHECK-NEXT: vrepli.b $vr2, 51
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
; CHECK-NEXT: vslli.h $vr0, $vr0, 2
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
; CHECK-NEXT: vsrli.h $vr1, $vr0, 1
; CHECK-NEXT: vrepli.b $vr2, 85
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
; CHECK-NEXT: vslli.h $vr0, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
; CHECK-NEXT: vshuf4i.h $vr0, $vr1, 27
; CHECK-NEXT: ret
%b = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> %a)
ret <8 x i16> %b
Expand All @@ -59,25 +41,13 @@ declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>)
define <4 x i32> @test_bitreverse_v4i32(<4 x i32> %a) nounwind {
; CHECK-LABEL: test_bitreverse_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vshuf4i.b $vr0, $vr0, 27
; CHECK-NEXT: vsrli.w $vr1, $vr0, 4
; CHECK-NEXT: vrepli.b $vr2, 15
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
; CHECK-NEXT: vslli.w $vr0, $vr0, 4
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
; CHECK-NEXT: vsrli.w $vr1, $vr0, 2
; CHECK-NEXT: vrepli.b $vr2, 51
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
; CHECK-NEXT: vslli.w $vr0, $vr0, 2
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
; CHECK-NEXT: vsrli.w $vr1, $vr0, 1
; CHECK-NEXT: vrepli.b $vr2, 85
; CHECK-NEXT: vand.v $vr1, $vr1, $vr2
; CHECK-NEXT: vand.v $vr0, $vr0, $vr2
; CHECK-NEXT: vslli.w $vr0, $vr0, 1
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 0
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
; CHECK-NEXT: vpickve2gr.d $a0, $vr0, 1
; CHECK-NEXT: bitrev.d $a0, $a0
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
; CHECK-NEXT: vshuf4i.w $vr0, $vr1, 177
; CHECK-NEXT: ret
%b = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> %a)
ret <4 x i32> %b
Expand Down

0 comments on commit 953838d

Please sign in to comment.