From 23595d1b9621067e5354924e2ca78e866a85690e Mon Sep 17 00:00:00 2001
From: adprasad-nvidia <adprasad@nvidia.com>
Date: Tue, 10 Sep 2024 10:57:07 +0100
Subject: [PATCH] [AArch64] Lower __builtin_bswap16 to rev16 if bswap followed
 by any_extend (#105375)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GCC compiles the built-in function `__builtin_bswap16` to the ARM
instruction `rev16`, which reverses the byte order of 16-bit data. On the
other hand, Clang compiles the same built-in function to e.g.
```
        rev     w8, w0
        lsr     w0, w8, #16
```
i.e. it performs a byte reversal of the 32-bit register (which moves the
lower half, containing the 16-bit data, to the upper half) and then
right-shifts the reversed 16-bit data back to the lower half of the
register.
We can improve Clang codegen by generating `rev16` instead of `rev` and
`lsr`, like GCC.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 19 +++++
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  5 ++
 llvm/test/CodeGen/AArch64/bswap.ll            | 78 +++++++++++++++++--
 3 files changed, 97 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 433ea596f06e06..516d0cf33aaeb0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -22379,6 +22379,25 @@ static SDValue performExtendCombine(SDNode *N,
       N->getOperand(0)->getOpcode() == ISD::SETCC)
     return performSignExtendSetCCCombine(N, DCI, DAG);
 
+  // If we see (any_extend (bswap ...)) with bswap returning an i16, we know
+  // that the top half of the result register must be unused, due to the
+  // any_extend. This means that we can replace this pattern with (rev16
+  // (any_extend ...)). This saves a machine instruction compared to (lsr (rev
+  // ...)), which is what this pattern would otherwise be lowered to.
+  // Only apply this optimisation if the any_extend in the original pattern
+  // extends to i32 or i64, because this type will become the input type to
+  // REV16 in the new pattern, so must be a legitimate REV16 input type.
+  SDValue Bswap = N->getOperand(0);
+  if (N->getOpcode() == ISD::ANY_EXTEND && Bswap.getOpcode() == ISD::BSWAP &&
+      Bswap.getValueType() == MVT::i16 &&
+      (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64)) {
+    SDLoc DL(N);
+    SDValue NewAnyExtend = DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0),
+                                       Bswap->getOperand(0));
+    return DAG.getNode(AArch64ISD::REV16, SDLoc(N), N->getValueType(0),
+                       NewAnyExtend);
+  }
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 7790cabd6db138..0ba69e8238efd4 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -758,6 +758,8 @@ def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
 def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
 def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
 
+def AArch64rev16_scalar : SDNode<"AArch64ISD::REV16", SDTIntUnaryOp>;
+
 def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
 def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
 def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
@@ -2840,6 +2842,9 @@ def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
 def : Pat<(srl (bswap top16Zero:$Rn), (i64 16)), (REV16Wr GPR32:$Rn)>;
 def : Pat<(srl (bswap top32Zero:$Rn), (i64 32)), (REV32Xr GPR64:$Rn)>;
 
+def : Pat<(AArch64rev16_scalar GPR32:$Rn), (REV16Wr GPR32:$Rn)>;
+def : Pat<(AArch64rev16_scalar GPR64:$Rn), (REV16Xr GPR64:$Rn)>;
+
 def : Pat<(or (and (srl GPR64:$Rn, (i64 8)), (i64 0x00ff00ff00ff00ff)),
               (and (shl GPR64:$Rn, (i64 8)), (i64 0xff00ff00ff00ff00))),
           (REV16Xr GPR64:$Rn)>;
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 9ee924dd2548a6..e90014be21deb3 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -3,17 +3,85 @@
 ; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI
 
 ; ====== Scalar Tests =====
-define i16 @bswap_i16(i16 %a){
-; CHECK-LABEL: bswap_i16:
+
+; ====== Scalar bswap.i16 Tests =====
+define i16 @bswap_i16_to_i16_anyext(i16 %a){
+; CHECK-SD-LABEL: bswap_i16_to_i16_anyext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    rev16 w0, w0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bswap_i16_to_i16_anyext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    rev w8, w0
+; CHECK-GI-NEXT:    lsr w0, w8, #16
+; CHECK-GI-NEXT:    ret
+    %3 = call i16 @llvm.bswap.i16(i16 %a)
+    ret i16 %3
+}
+declare i16 @llvm.bswap.i16(i16)
+
+; The zext here is optimised to an any_extend during isel.
+define i64 @bswap_i16_to_i64_anyext(i16 %a) {
+; CHECK-SD-LABEL: bswap_i16_to_i64_anyext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-SD-NEXT:    rev16 x8, x0
+; CHECK-SD-NEXT:    lsl x0, x8, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bswap_i16_to_i64_anyext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    rev w8, w0
+; CHECK-GI-NEXT:    lsr w8, w8, #16
+; CHECK-GI-NEXT:    and x8, x8, #0xffff
+; CHECK-GI-NEXT:    lsl x0, x8, #48
+; CHECK-GI-NEXT:    ret
+    %3 = call i16 @llvm.bswap.i16(i16 %a)
+    %4 = zext i16 %3 to i64
+    %5 = shl i64 %4, 48
+    ret i64 %5
+}
+
+; The zext here is optimised to an any_extend during isel.
+define i128 @bswap_i16_to_i128_anyext(i16 %a) {
+; CHECK-SD-LABEL: bswap_i16_to_i128_anyext:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    mov w8, w0
+; CHECK-SD-NEXT:    mov x0, xzr
+; CHECK-SD-NEXT:    rev w8, w8
+; CHECK-SD-NEXT:    lsr w8, w8, #16
+; CHECK-SD-NEXT:    lsl x1, x8, #48
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: bswap_i16_to_i128_anyext:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    mov w8, w0
+; CHECK-GI-NEXT:    mov x0, xzr
+; CHECK-GI-NEXT:    rev w8, w8
+; CHECK-GI-NEXT:    lsr w8, w8, #16
+; CHECK-GI-NEXT:    bfi x8, x8, #32, #32
+; CHECK-GI-NEXT:    and x8, x8, #0xffff
+; CHECK-GI-NEXT:    lsl x1, x8, #48
+; CHECK-GI-NEXT:    ret
+    %3 = call i16 @llvm.bswap.i16(i16 %a)
+    %4 = zext i16 %3 to i128
+    %5 = shl i128 %4, 112
+    ret i128 %5
+}
+
+define i32 @bswap_i16_to_i32_zext(i16 %a){
+; CHECK-LABEL: bswap_i16_to_i32_zext:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev w8, w0
 ; CHECK-NEXT:    lsr w0, w8, #16
 ; CHECK-NEXT:    ret
-    %3 = call i16 @llvm.bswap.i16(i16 %a)
-    ret i16 %3
+  %3 = call i16 @llvm.bswap.i16(i16 %a)
+  %4 = zext i16 %3 to i32
+  ret i32 %4
 }
-declare i16 @llvm.bswap.i16(i16)
 
+; ====== Other scalar bswap tests =====
 define i32 @bswap_i32(i32 %a){
 ; CHECK-LABEL: bswap_i32:
 ; CHECK:       // %bb.0: