diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index ae6b55e98827ff..9c9f31f3884069 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -817,4 +817,9 @@ multiclass ZAReadzArray{ defm SVREADZ_VG2 : ZAReadzArray<"2">; defm SVREADZ_VG4 : ZAReadzArray<"4">; + +let SMETargetGuard = "sme2,sme-lutv2" in { + def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>; +} + } // let SVETargetGuard = InvalidMode diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c new file mode 100644 index 00000000000000..a2f87aed3187cc --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c @@ -0,0 +1,42 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 + +// REQUIRES: aarch64-registered-target + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -o /dev/null %s + + +#include + +// CHECK-LABEL: define dso_local { , , , } @test_luti4_zt_u8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, [[OP_COERCE0]], [[OP_COERCE1]]) +// CHECK-NEXT: ret { , , , } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { , , , } @_Z19test_luti4_zt_u8_x411svuint8x2_t( +// CPP-CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, [[OP_COERCE0]], [[OP_COERCE1]]) +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] +// +svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { + return svluti4_zt_u8_x4(0, op); +} + +// CHECK-LABEL: define dso_local { , , , } @test_luti4_zt_s8_x4( +// CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, [[OP_COERCE0]], [[OP_COERCE1]]) +// CHECK-NEXT: ret { , , , } [[TMP0]] +// +// CPP-CHECK-LABEL: define dso_local { , , , } @_Z19test_luti4_zt_s8_x411svuint8x2_t( +// CPP-CHECK-SAME: [[OP_COERCE0:%.*]], [[OP_COERCE1:%.*]]) #[[ATTR0]] { +// CPP-CHECK-NEXT: [[ENTRY:.*:]] +// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { , , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, [[OP_COERCE0]], [[OP_COERCE1]]) +// CPP-CHECK-NEXT: ret { , , , } [[TMP0]] +// +svint8x4_t test_luti4_zt_s8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { + return svluti4_zt_s8_x4(0, op); +} diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp index 5de97649af5d3a..d9bb6daf974d5b 100644 --- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp @@ -350,3 +350,8 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16, svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} } + +void test_luti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") { + // Check Zt tile 0 + svluti4_zt_u8_x4(1, op); // expected-error {{argument value 1 is outside the valid range [0, 0]}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 2d8ce66f53ba8a..eda2f69dd230cc 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3769,6 +3769,12 @@ let TargetPrefix = "aarch64" in { : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty], [ImmArg>, ImmArg>, IntrReadMem]>; + + def int_aarch64_sme_luti4_zt_x4 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty], + [ImmArg>, IntrNoMem, IntrHasSideEffects]>; + // // Register scaling @@ -3794,6 +3800,7 @@ let TargetPrefix = "aarch64" in { [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], [IntrNoMem]>; + } // SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2 diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index dfb6b08b1f73b2..6133580a3cd771 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -400,8 +400,10 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { return SelectSVERegRegAddrMode(N, Scale, Base, Offset); } - void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc, - uint32_t MaxImm); + void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs, + unsigned Opc, uint32_t MaxImm); + + void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc); template bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) { @@ -1975,9 +1977,10 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs, SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode); } -void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, - unsigned NumOutVecs, - unsigned Opc, uint32_t MaxImm) { +void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node, + unsigned NumOutVecs, + unsigned Opc, + uint32_t MaxImm) { if (ConstantSDNode *Imm = dyn_cast(Node->getOperand(4))) if (Imm->getZExtValue() > MaxImm) return; @@ -1985,6 +1988,7 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, SDValue ZtValue; if (!ImmToReg(Node->getOperand(2), ZtValue)) return; + SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)}; SDLoc DL(Node); EVT VT = Node->getValueType(0); @@ -2003,6 +2007,34 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, CurDAG->RemoveDeadNode(Node); } +void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node, + unsigned NumOutVecs, + unsigned Opc) { + + SDValue ZtValue; + SmallVector Ops; + if (!ImmToReg(Node->getOperand(2), ZtValue)) + return; + + Ops.push_back(ZtValue); + Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)})); + SDLoc DL(Node); + EVT VT = Node->getValueType(0); + + SDNode *Instruction = + CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops); + SDValue SuperReg = SDValue(Instruction, 0); + + for (unsigned I = 0; I < NumOutVecs; ++I) + ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg( + AArch64::zsub0 + I, DL, VT, SuperReg)); + + // Copy chain + unsigned ChainIdx = NumOutVecs; + ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1)); + CurDAG->RemoveDeadNode(Node); +} + void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs, unsigned Op) { SDLoc DL(N); @@ -5478,7 +5510,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { {AArch64::LUTI2_4ZTZI_B, AArch64::LUTI2_4ZTZI_H, AArch64::LUTI2_4ZTZI_S})) // Second Immediate must be <= 3: - SelectMultiVectorLuti(Node, 4, Opc, 3); + SelectMultiVectorLutiLane(Node, 4, Opc, 3); return; } case Intrinsic::aarch64_sme_luti4_lane_zt_x4: { @@ -5486,7 +5518,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { Node->getValueType(0), {0, AArch64::LUTI4_4ZTZI_H, AArch64::LUTI4_4ZTZI_S})) // Second Immediate must be <= 1: - SelectMultiVectorLuti(Node, 4, Opc, 1); + SelectMultiVectorLutiLane(Node, 4, Opc, 1); return; } case Intrinsic::aarch64_sme_luti2_lane_zt_x2: { @@ -5495,7 +5527,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { {AArch64::LUTI2_2ZTZI_B, AArch64::LUTI2_2ZTZI_H, AArch64::LUTI2_2ZTZI_S})) // Second Immediate must be <= 7: - SelectMultiVectorLuti(Node, 2, Opc, 7); + SelectMultiVectorLutiLane(Node, 2, Opc, 7); return; } case Intrinsic::aarch64_sme_luti4_lane_zt_x2: { @@ -5504,7 +5536,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { {AArch64::LUTI4_2ZTZI_B, AArch64::LUTI4_2ZTZI_H, AArch64::LUTI4_2ZTZI_S})) // Second Immediate must be <= 3: - SelectMultiVectorLuti(Node, 2, Opc, 3); + SelectMultiVectorLutiLane(Node, 2, Opc, 3); + return; + } + case Intrinsic::aarch64_sme_luti4_zt_x4: { + SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z); return; } } diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index ebe4121c944b1e..e2261694d658c5 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -940,7 +940,7 @@ defm FAMIN_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famin", 0b0010101>; let Predicates = [HasSME2, HasSME_LUTv2] in { defm MOVT : sme2_movt_zt_to_zt<"movt", 0b0011111>; -def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">; +def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">; } //[HasSME2, HasSME_LUTv2] let Predicates = [HasSME2p1, HasSME_LUTv2] in { diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll new file mode 100644 index 00000000000000..778f31194baf45 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s + +target triple = "aarch64-linux" + +define {, , , } @test_luti4_zt_i8( %v0, %v1) #0 { +; CHECK-LABEL: test_luti4_zt_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 } +; CHECK-NEXT: ret + %res = call {, , , } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, %v0, %v1) + ret {, , , } %res +} + +attributes #0 = { "target-features"="+sme2,+sme-lutv2"}