From 23595d1b9621067e5354924e2ca78e866a85690e Mon Sep 17 00:00:00 2001 From: adprasad-nvidia Date: Tue, 10 Sep 2024 10:57:07 +0100 Subject: [PATCH] [AArch64] Lower __builtin_bswap16 to rev16 if bswap followed by any_extend (#105375) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GCC compiles the built-in function `__builtin_bswap16` to the ARM instruction `rev16`, which reverses the byte order of 16-bit data. On the other hand, Clang compiles the same built-in function to e.g. ```     rev     w8, w0         lsr     w0, w8, #16 ``` i.e. it performs a byte reversal of a 32-bit register (which moves the lower half, which contains the 16-bit data, to the upper half) and then right shifts the reversed 16-bit data back to the lower half of the register. We can improve Clang codegen by generating `rev16` instead of `rev` and `lsr`, like GCC. --- .../Target/AArch64/AArch64ISelLowering.cpp | 19 +++++ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 5 ++ llvm/test/CodeGen/AArch64/bswap.ll | 78 +++++++++++++++++-- 3 files changed, 97 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 433ea596f06e06..516d0cf33aaeb0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -22379,6 +22379,25 @@ static SDValue performExtendCombine(SDNode *N, N->getOperand(0)->getOpcode() == ISD::SETCC) return performSignExtendSetCCCombine(N, DCI, DAG); + // If we see (any_extend (bswap ...)) with bswap returning an i16, we know + // that the top half of the result register must be unused, due to the + // any_extend. This means that we can replace this pattern with (rev16 + // (any_extend ...)). This saves a machine instruction compared to (lsr (rev + // ...)), which is what this pattern would otherwise be lowered to. 
+ // Only apply this optimisation if any_extend in original pattern to i32 or + // i64, because this type will become the input type to REV16 in the new + // pattern, so must be a legitimate REV16 input type. + SDValue Bswap = N->getOperand(0); + if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP && + Bswap.getValueType() == MVT::i16 && + (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) { + SDLoc DL(N); + SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), + Bswap->getOperand(0)); + return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0), + NewAnyExtend); + } + return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 7790cabd6db138..0ba69e8238efd4 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -758,6 +758,8 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>; def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>; def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>; +def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>; + def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>; def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>; def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>; @@ -2840,6 +2842,9 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>; def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>; def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>; +def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>; +def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>; + def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)), (and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))), (REV16Xr GPR64:$Rn)>; diff --git a/llvm/test/CodeGen/AArch64/bswap.ll 
b/llvm/test/CodeGen/AArch64/bswap.ll index 9ee924dd2548a6..e90014be21deb3 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -3,17 +3,85 @@ ; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; ====== Scalar Tests ===== -define i16 @bswap_i16(i16 %a){ -; CHECK-LABEL: bswap_i16: + +; ====== Scalar bswap.i16 Tests ===== +define i16 @bswap_i16_to_i16_anyext(i16 %a){ +; CHECK-SD-LABEL: bswap_i16_to_i16_anyext: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: rev16 w0, w0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bswap_i16_to_i16_anyext: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: rev w8, w0 +; CHECK-GI-NEXT: lsr w0, w8, #16 +; CHECK-GI-NEXT: ret + %3 = call i16 @llvm.bswap.i16(i16 %a) + ret i16 %3 +} +declare i16 @llvm.bswap.i16(i16) + +; The zext here is optimised to an any_extend during isel. +define i64 @bswap_i16_to_i64_anyext(i16 %a) { +; CHECK-SD-LABEL: bswap_i16_to_i64_anyext: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0 +; CHECK-SD-NEXT: rev16 x8, x0 +; CHECK-SD-NEXT: lsl x0, x8, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bswap_i16_to_i64_anyext: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: rev w8, w0 +; CHECK-GI-NEXT: lsr w8, w8, #16 +; CHECK-GI-NEXT: and x8, x8, #0xffff +; CHECK-GI-NEXT: lsl x0, x8, #48 +; CHECK-GI-NEXT: ret + %3 = call i16 @llvm.bswap.i16(i16 %a) + %4 = zext i16 %3 to i64 + %5 = shl i64 %4, 48 + ret i64 %5 +} + +; The zext here is optimised to an any_extend during isel. 
+define i128 @bswap_i16_to_i128_anyext(i16 %a) { +; CHECK-SD-LABEL: bswap_i16_to_i128_anyext: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: mov w8, w0 +; CHECK-SD-NEXT: mov x0, xzr +; CHECK-SD-NEXT: rev w8, w8 +; CHECK-SD-NEXT: lsr w8, w8, #16 +; CHECK-SD-NEXT: lsl x1, x8, #48 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: bswap_i16_to_i128_anyext: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, w0 +; CHECK-GI-NEXT: mov x0, xzr +; CHECK-GI-NEXT: rev w8, w8 +; CHECK-GI-NEXT: lsr w8, w8, #16 +; CHECK-GI-NEXT: bfi x8, x8, #32, #32 +; CHECK-GI-NEXT: and x8, x8, #0xffff +; CHECK-GI-NEXT: lsl x1, x8, #48 +; CHECK-GI-NEXT: ret + %3 = call i16 @llvm.bswap.i16(i16 %a) + %4 = zext i16 %3 to i128 + %5 = shl i128 %4, 112 + ret i128 %5 +} + +define i32 @bswap_i16_to_i32_zext(i16 %a){ +; CHECK-LABEL: bswap_i16_to_i32_zext: ; CHECK: // %bb.0: ; CHECK-NEXT: rev w8, w0 ; CHECK-NEXT: lsr w0, w8, #16 ; CHECK-NEXT: ret - %3 = call i16 @llvm.bswap.i16(i16 %a) - ret i16 %3 + %3 = call i16 @llvm.bswap.i16(i16 %a) + %4 = zext i16 %3 to i32 + ret i32 %4 } -declare i16 @llvm.bswap.i16(i16) +; ====== Other scalar bswap tests ===== define i32 @bswap_i32(i32 %a){ ; CHECK-LABEL: bswap_i32: ; CHECK: // %bb.0: