Skip to content

Commit

Permalink
[Clang][LLVM][AArch64] Add intrinsic for LUTI4 SME2 instruction (#97755
Browse files Browse the repository at this point in the history
…) (#109953)

This patch was reverted because of a failing C test.
It now has being solved and can be merged into main again 

This patch adds these intrinsics:

// Variants are also available for: _s8
svuint8x4_t svluti4_zt_u8_x4(uint64_t zt0, svuint8x2_t zn)
__arm_streaming __arm_in("zt0");

according to PR#324[1]
[1]ARM-software/acle#324
  • Loading branch information
CarolineConcatto authored Sep 30, 2024
1 parent 587eaef commit c0e97c4
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 10 deletions.
5 changes: 5 additions & 0 deletions clang/include/clang/Basic/arm_sme.td
Original file line number Diff line number Diff line change
Expand Up @@ -817,4 +817,9 @@ multiclass ZAReadzArray<string vg_num>{

defm SVREADZ_VG2 : ZAReadzArray<"2">;
defm SVREADZ_VG4 : ZAReadzArray<"4">;

let SMETargetGuard = "sme2,sme-lutv2" in {
def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}

} // let SVETargetGuard = InvalidMode
42 changes: 42 additions & 0 deletions clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_zt.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5

// REQUIRES: aarch64-registered-target

// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +sme-lutv2 -disable-O0-optnone -Werror -Wall -o /dev/null %s


#include <arm_sme.h>

// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_luti4_zt_u8_x4(
// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[OP_COERCE0]], <vscale x 16 x i8> [[OP_COERCE1]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z19test_luti4_zt_u8_x411svuint8x2_t(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[OP_COERCE0]], <vscale x 16 x i8> [[OP_COERCE1]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
svuint8x4_t test_luti4_zt_u8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
return svluti4_zt_u8_x4(0, op);
}

// CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @test_luti4_zt_s8_x4(
// CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[OP_COERCE0]], <vscale x 16 x i8> [[OP_COERCE1]])
// CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @_Z19test_luti4_zt_s8_x411svuint8x2_t(
// CPP-CHECK-SAME: <vscale x 16 x i8> [[OP_COERCE0:%.*]], <vscale x 16 x i8> [[OP_COERCE1:%.*]]) #[[ATTR0]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> [[OP_COERCE0]], <vscale x 16 x i8> [[OP_COERCE1]])
// CPP-CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8> } [[TMP0]]
//
svint8x4_t test_luti4_zt_s8_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
return svluti4_zt_s8_x4(0, op);
}
5 changes: 5 additions & 0 deletions clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -350,3 +350,8 @@ void test_svdot_multi_za32_bad_lane(uint32_t slice_base, svuint16_t z_u16,
svsudot_lane_za32_s8_vg1x2(slice_base, z_s8x2, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
svsudot_lane_za32_s8_vg1x4(slice_base, z_s8x4, z_u8, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}}
}

void test_luti4_zt_x4(svuint8x2_t op) __arm_streaming __arm_in("zt0") {
// Check Zt tile 0
svluti4_zt_u8_x4(1, op); // expected-error {{argument value 1 is outside the valid range [0, 0]}}
}
7 changes: 7 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
Original file line number Diff line number Diff line change
Expand Up @@ -3769,6 +3769,12 @@ let TargetPrefix = "aarch64" in {
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_i32_ty],
[ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, IntrReadMem]>;

def int_aarch64_sme_luti4_zt_x4
: DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
[llvm_i32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects]>;


//
// Register scaling
Expand All @@ -3794,6 +3800,7 @@ let TargetPrefix = "aarch64" in {
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>,
LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
[IntrNoMem]>;

}

// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2
Expand Down
54 changes: 45 additions & 9 deletions llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,8 +400,10 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
}

void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc,
uint32_t MaxImm);
void SelectMultiVectorLutiLane(SDNode *Node, unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm);

void SelectMultiVectorLuti(SDNode *Node, unsigned NumOutVecs, unsigned Opc);

template <unsigned MaxIdx, unsigned Scale>
bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
Expand Down Expand Up @@ -1975,16 +1977,18 @@ void AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
SelectUnaryMultiIntrinsic(N, NumVecs, true, Opcode);
}

void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc, uint32_t MaxImm) {
void AArch64DAGToDAGISel::SelectMultiVectorLutiLane(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc,
uint32_t MaxImm) {
if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Node->getOperand(4)))
if (Imm->getZExtValue() > MaxImm)
return;

SDValue ZtValue;
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
return;

SDValue Ops[] = {ZtValue, Node->getOperand(3), Node->getOperand(4)};
SDLoc DL(Node);
EVT VT = Node->getValueType(0);
Expand All @@ -2003,6 +2007,34 @@ void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
CurDAG->RemoveDeadNode(Node);
}

void AArch64DAGToDAGISel::SelectMultiVectorLuti(SDNode *Node,
unsigned NumOutVecs,
unsigned Opc) {

SDValue ZtValue;
SmallVector<SDValue, 4> Ops;
if (!ImmToReg<AArch64::ZT0, 0>(Node->getOperand(2), ZtValue))
return;

Ops.push_back(ZtValue);
Ops.push_back(createZMulTuple({Node->getOperand(3), Node->getOperand(4)}));
SDLoc DL(Node);
EVT VT = Node->getValueType(0);

SDNode *Instruction =
CurDAG->getMachineNode(Opc, DL, {MVT::Untyped, MVT::Other}, Ops);
SDValue SuperReg = SDValue(Instruction, 0);

for (unsigned I = 0; I < NumOutVecs; ++I)
ReplaceUses(SDValue(Node, I), CurDAG->getTargetExtractSubreg(
AArch64::zsub0 + I, DL, VT, SuperReg));

// Copy chain
unsigned ChainIdx = NumOutVecs;
ReplaceUses(SDValue(Node, ChainIdx), SDValue(Instruction, 1));
CurDAG->RemoveDeadNode(Node);
}

void AArch64DAGToDAGISel::SelectClamp(SDNode *N, unsigned NumVecs,
unsigned Op) {
SDLoc DL(N);
Expand Down Expand Up @@ -5478,15 +5510,15 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
{AArch64::LUTI2_4ZTZI_B, AArch64::LUTI2_4ZTZI_H,
AArch64::LUTI2_4ZTZI_S}))
// Second Immediate must be <= 3:
SelectMultiVectorLuti(Node, 4, Opc, 3);
SelectMultiVectorLutiLane(Node, 4, Opc, 3);
return;
}
case Intrinsic::aarch64_sme_luti4_lane_zt_x4: {
if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
Node->getValueType(0),
{0, AArch64::LUTI4_4ZTZI_H, AArch64::LUTI4_4ZTZI_S}))
// Second Immediate must be <= 1:
SelectMultiVectorLuti(Node, 4, Opc, 1);
SelectMultiVectorLutiLane(Node, 4, Opc, 1);
return;
}
case Intrinsic::aarch64_sme_luti2_lane_zt_x2: {
Expand All @@ -5495,7 +5527,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
{AArch64::LUTI2_2ZTZI_B, AArch64::LUTI2_2ZTZI_H,
AArch64::LUTI2_2ZTZI_S}))
// Second Immediate must be <= 7:
SelectMultiVectorLuti(Node, 2, Opc, 7);
SelectMultiVectorLutiLane(Node, 2, Opc, 7);
return;
}
case Intrinsic::aarch64_sme_luti4_lane_zt_x2: {
Expand All @@ -5504,7 +5536,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
{AArch64::LUTI4_2ZTZI_B, AArch64::LUTI4_2ZTZI_H,
AArch64::LUTI4_2ZTZI_S}))
// Second Immediate must be <= 3:
SelectMultiVectorLuti(Node, 2, Opc, 3);
SelectMultiVectorLutiLane(Node, 2, Opc, 3);
return;
}
case Intrinsic::aarch64_sme_luti4_zt_x4: {
SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z);
return;
}
}
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -940,7 +940,7 @@ defm FAMIN_4Z4Z : sme2_fp_sve_destructive_vector_vg4_multi<"famin", 0b0010101>;

let Predicates = [HasSME2, HasSME_LUTv2] in {
defm MOVT : sme2_movt_zt_to_zt<"movt", 0b0011111>;
def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
def LUTI4_4ZZT2Z : sme2_luti4_vector_vg4<0b00, 0b00,"luti4">;
} //[HasSME2, HasSME_LUTv2]

let Predicates = [HasSME2p1, HasSME_LUTv2] in {
Expand Down
17 changes: 17 additions & 0 deletions llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -verify-machineinstrs -force-streaming < %s | FileCheck %s

target triple = "aarch64-linux"

define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @test_luti4_zt_i8(<vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) #0 {
; CHECK-LABEL: test_luti4_zt_i8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 }
; CHECK-NEXT: ret
%res = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.aarch64.sme.luti4.zt.x4.nxv16i8(i32 0, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1)
ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %res
}

attributes #0 = { "target-features"="+sme2,+sme-lutv2"}

0 comments on commit c0e97c4

Please sign in to comment.