[SystemZ] Support fp16 vector ABI and basic codegen.#171066
[SystemZ] Support fp16 vector ABI and basic codegen.#171066
Conversation
|
@llvm/pr-subscribers-backend-systemz Author: Jonas Paulsson (JonPsson1) Changes
This seems to be the better handling, at least if narrow (e.g. 4 element) vectors are considered and not supposed to get all 8 ops emitted. Patch in progress, with some tests in place that are passing. There are several opportunities for optimizing the results, but not sure if or what would be relevant at this point, with the conversion routines having a heavy overhead. Fixes #168992 Patch is 135.60 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/171066.diff 12 Files Affected:
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 2795de5eeeb66..69202e3fcbc57 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -50,7 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[
// Sub-128 vectors are returned in the same way, but they're widened
// to one of these types during type legalization.
CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>
]>;
@@ -116,19 +116,19 @@ def CC_SystemZ_ELF : CallingConv<[
// are passed in the same way, but they're widened to one of these types
// during type legalization.
CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCIfArgFixed<CCAssignToReg<[V24, V26, V28, V30,
V25, V27, V29, V31]>>>>,
// However, sub-128 vectors which need to go on the stack occupy just a
// single 8-byte-aligned 8-byte stack slot. Pass as i64.
CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCIfShortVector<CCBitConvertToType<i64>>>>,
// Other vector arguments are passed in 8-byte-aligned 16-byte stack slots.
CCIfSubtarget<"hasVector()",
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
CCAssignToStack<16, 8>>>,
// Other arguments are passed in 8-byte-aligned 8-byte stack slots.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 2511d08a6d0ef..aaa6c22eaf01a 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -123,6 +123,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v8f16, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
}
@@ -620,6 +621,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// Handle floating-point vector types.
if (Subtarget.hasVector()) {
// Scalar-to-vector conversion is just a subreg.
+ setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal);
@@ -627,6 +629,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// need to go via integers.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
@@ -842,6 +845,33 @@ bool SystemZTargetLowering::useSoftFloat() const {
return Subtarget.hasSoftFloat();
}
+unsigned SystemZTargetLowering::getNumRegisters(LLVMContext &Context, EVT VT,
+ std::optional<MVT> RegisterVT) const {
+ // An i128 inline assembly operand with an Untyped register type: 1 register.
+ if (VT == MVT::i128 && RegisterVT && *RegisterVT == MVT::Untyped)
+ return 1;
+ // f16 vectors follow the vector ABI even though they are generally expanded:
+ // one register per SystemZ::VectorBytes / 2 elements, rounded up.
+ if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16)
+ return divideCeil(VT.getVectorNumElements(), SystemZ::VectorBytes / 2);
+ return TargetLowering::getNumRegisters(Context, VT);
+}
+
+MVT SystemZTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC,
+ EVT VT) const {
+ // A 128-bit vector with a single element is passed like other vectors
+ // (as v16i8), not like its element type.
+ if (VT.isVector() && VT.getSizeInBits() == 128 &&
+ VT.getVectorNumElements() == 1)
+ return MVT::v16i8;
+ // With vector support, every f16 vector is passed as v8f16 per the ABI,
+ // whatever its element count, even though such vectors are generally expanded.
+ if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16)
+ return MVT::v8f16;
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+}
+
EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext &, EVT VT) const {
if (!VT.isVector())
@@ -2051,6 +2081,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
case MVT::v8i16:
case MVT::v4i32:
case MVT::v2i64:
+ case MVT::v8f16:
case MVT::v4f32:
case MVT::v2f64:
RC = &SystemZ::VR128BitRegClass;
@@ -6351,6 +6382,37 @@ bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const {
return false;
}
+static SDValue mergeHighParts(SelectionDAG &DAG, const SDLoc &DL,
+ unsigned MergedBits, EVT VT, SDValue Op0,
+ SDValue Op1) {
+ MVT IntVecVT = MVT::getVectorVT(MVT::getIntegerVT(MergedBits),
+ SystemZ::VectorBits / MergedBits); // Integer elements of MergedBits each.
+ assert(VT.getSizeInBits() == 128 && IntVecVT.getSizeInBits() == 128 &&
+ "Handling full vectors only.");
+ Op0 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0);
+ Op1 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op1);
+ SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
+ DL, IntVecVT, Op0, Op1); // MERGE_HIGH is done on the integer type.
+ return DAG.getNode(ISD::BITCAST, DL, VT, Op); // Cast the result back to VT.
+}
+
+static SDValue buildFPVecFromScalars4(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
+ SmallVectorImpl<SDValue> &Elems,
+ unsigned Pos) { // Combines Elems[Pos] .. Elems[Pos + 3].
+ SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 0], Elems[Pos + 1]);
+ SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 2], Elems[Pos + 3]);
+ // If one merged pair came out undef, reuse the other pair in its place.
+ if (Op01.isUndef())
+ Op01 = Op23;
+ else if (Op23.isUndef())
+ Op23 = Op01;
+ // Both halves are the same REPLICATE node: merging them would be a no-op.
+ if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
+ return Op01;
+ unsigned MergedBits = VT.getSimpleVT().getScalarSizeInBits() * 2; // One pair.
+ return mergeHighParts(DAG, DL, MergedBits, VT, Op01, Op23);
+}
+
// Combine GPR scalar values Elems into a vector of type VT.
SDValue
SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
@@ -6409,22 +6471,17 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
// <ABxx> <CDxx>
// V VMRHG
// <ABCD>
- if (VT == MVT::v4f32 && !AllLoads) {
- SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
- SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
- // Avoid unnecessary undefs by reusing the other operand.
- if (Op01.isUndef())
- Op01 = Op23;
- else if (Op23.isUndef())
- Op23 = Op01;
+ if (VT == MVT::v4f32 && !AllLoads)
+ return buildFPVecFromScalars4(DAG, DL, VT, Elems, 0);
+
+ // Same for v8f16.
+ if (VT == MVT::v8f16 && !AllLoads) {
+ SDValue Op0123 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 0);
+ SDValue Op4567 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 4);
// Merging identical replications is a no-op.
- if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23)
- return Op01;
- Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01);
- Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23);
- SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH,
- DL, MVT::v2i64, Op01, Op23);
- return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ if (Op0123.getOpcode() == SystemZISD::REPLICATE && Op0123 == Op4567)
+ return Op0123;
+ return mergeHighParts(DAG, DL, 64, VT, Op0123, Op4567);
}
// Collect the constant terms.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 13a1cd1614a53..ca47b96ef2d80 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -64,27 +64,19 @@ class SystemZTargetLowering : public TargetLowering {
//
// (c) there are no multiplication instructions for the widest integer
// type (v2i64).
+
+ // Expand (narrow) f16 vectors during type legalization to avoid
+ // operations for all elements as with expansion after widening.
+ if (VT.getScalarType() == MVT::f16)
+ return VT.getVectorElementCount().isScalar() ? TypeScalarizeVector : TypeSplitVector;
if (VT.getScalarSizeInBits() % 8 == 0)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
- unsigned
- getNumRegisters(LLVMContext &Context, EVT VT,
- std::optional<MVT> RegisterVT) const override {
- // i128 inline assembly operand.
- if (VT == MVT::i128 && RegisterVT && *RegisterVT == MVT::Untyped)
- return 1;
- return TargetLowering::getNumRegisters(Context, VT);
- }
+ unsigned getNumRegisters(LLVMContext &Context, EVT VT,
+ std::optional<MVT> RegisterVT) const override;
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
- EVT VT) const override {
- // 128-bit single-element vector types are passed like other vectors,
- // not like their element type.
- if (VT.isVector() && VT.getSizeInBits() == 128 &&
- VT.getVectorNumElements() == 1)
- return MVT::v16i8;
- return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
- }
+ EVT VT) const override;
bool isCheapToSpeculateCtlz(Type *) const override { return true; }
bool isCheapToSpeculateCttz(Type *) const override { return true; }
bool preferZeroCompareBranch() const override { return true; }
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index 479bab5ce62b8..3eb66d06cc16d 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -348,6 +348,7 @@ let Predicates = [FeatureVector] in {
def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>;
def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>;
def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>;
+ def : BinaryRRWithType<VMRHH, VR128, z_merge_high, v8f16>;
def : BinaryRRWithType<VMRHF, VR128, z_merge_high, v4f32>;
def : BinaryRRWithType<VMRHG, VR128, z_merge_high, v2f64>;
@@ -357,6 +358,7 @@ let Predicates = [FeatureVector] in {
def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>;
def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>;
def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>;
+ def : BinaryRRWithType<VMRLH, VR128, z_merge_low, v8f16>;
def : BinaryRRWithType<VMRLF, VR128, z_merge_low, v4f32>;
def : BinaryRRWithType<VMRLG, VR128, z_merge_low, v2f64>;
@@ -497,6 +499,7 @@ defm : GenericVectorOps<v16i8, v16i8>;
defm : GenericVectorOps<v8i16, v8i16>;
defm : GenericVectorOps<v4i32, v4i32>;
defm : GenericVectorOps<v2i64, v2i64>;
+defm : GenericVectorOps<v8f16, v8i16>;
defm : GenericVectorOps<v4f32, v4i32>;
defm : GenericVectorOps<v2f64, v2i64>;
@@ -2110,6 +2113,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (i128 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8f16 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>;
@@ -2118,6 +2122,7 @@ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (i128 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8f16 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>;
@@ -2126,6 +2131,7 @@ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (i128 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8f16 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>;
@@ -2134,15 +2140,26 @@ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (i128 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8f16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v16i8 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v8i16 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4i32 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2i64 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (i128 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v4f32 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (v2f64 VR128:$src))), (v8f16 VR128:$src)>;
+def : Pat<(v8f16 (bitconvert (f128 VR128:$src))), (v8f16 VR128:$src)>;
+
def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (i128 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8f16 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>;
@@ -2151,6 +2168,7 @@ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (i128 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8f16 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>;
@@ -2159,6 +2177,7 @@ def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (i128 VR128:$src))), (f128 VR128:$src)>;
+def : Pat<(f128 (bitconvert (v8f16 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>;
def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>;
@@ -2166,6 +2185,7 @@ def : Pat<(i128 (bitconvert (v16i8 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (v8i16 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (v4i32 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (v2i64 VR128:$src))), (i128 VR128:$src)>;
+def : Pat<(i128 (bitconvert (v8f16 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (v4f32 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (v2f64 VR128:$src))), (i128 VR128:$src)>;
def : Pat<(i128 (bitconvert (f128 VR128:$src))), (i128 VR128:$src)>;
@@ -2216,6 +2236,7 @@ multiclass ScalarToVectorFP<Instruction vrep, ValueType vt, RegisterOperand cls,
(vrep (INSERT_SUBREG (vt (IMPLICIT_DEF)), cls:$scalar,
subreg), 0)>;
}
+defm : ScalarToVectorFP<VREPH, v8f16, FP16, subreg_h16>;
defm : ScalarToVectorFP<VREPF, v4f32, FP32, subreg_h32>;
defm : ScalarToVectorFP<VREPG, v2f64, FP64, subreg_h64>;
@@ -2236,6 +2257,11 @@ let AddedComplexity = 4 in {
// 3 added by TableGen for the base register operand in VLGV-based integer
// extractions and ensures that this version is strictly better.
let AddedComplexity = 4 in {
+ def : Pat<(f16 (z_vector_extract (v8f16 VR128:$vec), 0)),
+ (EXTRACT_SUBREG VR128:$vec, subreg_h16)>;
+ def : Pat<(f16 (z_vector_extract (v8f16 VR128:$vec), imm32zx3:$index)),
+ (EXTRACT_SUBREG (VREPH VR128:$vec, imm32zx3:$index), subreg_h16)>;
+
def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), 0)),
(EXTRACT_SUBREG VR128:$vec, subreg_h32)>;
def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), imm32zx2:$index)),
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index e79f12b449a88..1ef8e81c8f829 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -305,13 +305,13 @@ defm VR64 : SystemZRegClass<"VR64", [f64, v8i8, v4i16, v2i32, v2f32], 64,
// The subset of vector registers that can be used for floating-point
// operations too.
defm VF128 : SystemZRegClass<"VF128",
- [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
- (sequence "V%u", 0, 15)>;
+ [v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
+ 128, (sequence "V%u", 0, 15)>;
// All vector registers.
defm VR128 : SystemZRegClass<"VR128",
[v16i8, v8i16, v4i32, v2i64, i128,
- v4f32, v2f64, f128],
+ v8f16, v4f32, v2f64, f128],
128, (add (sequence "V%u", 0, 7),
(sequence "V%u", 16, 31),
(sequence "V%u", 8, 15))>;
diff --git a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
index e02f931c4d31e..d0f3414e89497 100644
--- a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
+++ b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll
@@ -111,87 +111,93 @@ define void @canonicalize_ptr_f128(ptr %out) {
define <8 x half> @canonicalize_v8f16(<8 x half> %a) nounwind {
; Z16-LABEL: canonicalize_v8f16:
; Z16: # %bb.0:
-; Z16-NEXT: stmg %r13, %r15, 104(%r15)
+; Z16-NEXT: stmg %r14, %r15, 112(%r15)
; Z16-NEXT: aghi %r15, -224
-; Z16-NEXT: std %f8, 216(%r15) # 8-byte Spill
-; Z16-NEXT: std %f9, 208(%r15) # 8-byte Spill
-; Z16-NEXT: std %f10, 200(%r15) # 8-byte Spill
-; Z16-NEXT: std %f11, 192(%r15) # 8-byte Spill
-; Z16-NEXT: std %f12, 184(%r15) # 8-byte Spill
-; Z16-NEXT: std %f13, 17...
[truncated]
|
You can test this locally with the following command:
git-clang-format --diff origin/main HEAD --extensions c,h,cpp -- clang/test/CodeGen/SystemZ/systemz-abi-vector.c llvm/lib/Target/SystemZ/SystemZISelLowering.cpp llvm/lib/Target/SystemZ/SystemZISelLowering.h --diff_from_common_commit
View the diff from clang-format here.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index b0b072a5b..c68e37508 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -850,7 +850,8 @@ unsigned SystemZTargetLowering::getVectorTypeBreakdownForCallingConv(
LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
unsigned &NumIntermediates, MVT &RegisterVT) const {
// Pass fp16 vectors in VR(s).
- if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16) {
+ if (Subtarget.hasVector() && VT.isVector() &&
+ VT.getScalarType() == MVT::f16) {
IntermediateVT = RegisterVT = MVT::v8f16;
return NumIntermediates =
divideCeil(VT.getVectorNumElements(), SystemZ::VectorBytes / 2);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 2f78a285a..535cd6a38 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -82,11 +82,11 @@ public:
return 1;
return TargetLowering::getNumRegisters(Context, VT);
}
- unsigned
- getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC,
- EVT VT, EVT &IntermediateVT,
- unsigned &NumIntermediates,
- MVT &RegisterVT) const override;
+ unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context,
+ CallingConv::ID CC, EVT VT,
+ EVT &IntermediateVT,
+ unsigned &NumIntermediates,
+ MVT &RegisterVT) const override;
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
EVT VT) const override;
unsigned getNumRegistersForCallingConv(LLVMContext &Context,
|
You can test this locally with the following command:
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef([^a-zA-Z0-9_-]|$)|UndefValue::get)' 'HEAD~1' HEAD llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll llvm/test/CodeGen/SystemZ/vec-abi-01.ll llvm/test/CodeGen/SystemZ/vec-abi-02.ll llvm/test/CodeGen/SystemZ/vec-abi-03.ll llvm/test/CodeGen/SystemZ/vec-abi-04.ll llvm/test/CodeGen/SystemZ/vec-abi-05.ll clang/test/CodeGen/SystemZ/systemz-abi-vector.c llvm/lib/Target/SystemZ/SystemZISelLowering.cpp llvm/lib/Target/SystemZ/SystemZISelLowering.h llvm/test/CodeGen/SystemZ/atomic-memops.ll llvm/test/CodeGen/SystemZ/canonicalize-vars.ll llvm/test/CodeGen/SystemZ/vec-cmp-09.ll llvm/test/CodeGen/SystemZ/vec-eswap-01.ll llvm/test/CodeGen/SystemZ/vec-eswap-02.ll llvm/test/CodeGen/SystemZ/vec-move-04.ll llvm/test/CodeGen/SystemZ/vec-move-05.ll llvm/test/CodeGen/SystemZ/vec-move-07.ll llvm/test/CodeGen/SystemZ/vec-move-10.ll llvm/test/CodeGen/SystemZ/vec-move-12.ll llvm/test/CodeGen/SystemZ/vec-perm-01.ll llvm/test/CodeGen/SystemZ/vec-perm-03.ll
The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields `undef`. In tests, avoid using `undef`. For example, this is considered a bad practice:
define void @fn() {
...
br i1 undef, ...
}
Please use the following instead:
define void @fn(i1 %cond) {
...
br i1 %cond, ...
}
Please refer to the Undefined Behavior Manual for more information. |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
| define void @fun0_arg(%Ty0 %A) { | ||
| ; CHECK-LABEL: fun0_arg: | ||
| ; CHECK: # %bb.0: | ||
| ; CHECK-NEXT: lgh %r0, 166(%r15) |
There was a problem hiding this comment.
I didn't do a full review yet, but just as a quick heads-up: this doesn't seem to be the correct ABI here. We said that <8 x half> should be handled like other vector types, which means for the pre-z13 ABI passed by reference. The code here however appears to treat it like passing 8 separate half values (first four in FPRs, rest in separate argument stack slots).
There was a problem hiding this comment.
I guess this is the same "LLVM IR ABI" type of arguments as with the {i128} - it works the same if I use <4 x float>. Maybe I should add some Clang FE test case to see the Z ABI compliant code being emitted?
|
Patch rebased. Testing added both for Clang FE and CodeGen. (NFC from before). |
|
Patch rebased + non-power of two tests removed. |
uweigand
left a comment
There was a problem hiding this comment.
Haven't looked at all the new tests in detail yet, but a couple of comments on the implementation so far.
| // Pass narrow fp16 vectors per the ABI even though they are generally | ||
| // expanded. | ||
| if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16) | ||
| return divideCeil(VT.getVectorNumElements(), SystemZ::VectorBytes / 2); |
There was a problem hiding this comment.
Can this ever get called with non-power-of-two element numbers? Common getNumRegisters code also doesn't seem to handle this ...
There was a problem hiding this comment.
I would think it could - at least in theory - but I thought the divideCeil() would take care of that by widening e.g. a vector of 4 or 6 elements to take one vector register, or one with 12 elements to 2 registers?
There was a problem hiding this comment.
I think the case I was more concerned about is e.g. a 48-byte vector (24 half elements). Does this take 3 registers, or is this implicitly extended to the next power-of-two and thus takes 4 registers?
Waiting with VLLEZ, VLEH, ... patterns: Splitting "everything" for now. In some test cases this could be improved, e.g. with a vector in a register, a VLEH and then VST, which could actually be done. Maybe that can wait as long as it is functional... Found a derived test case with an IR that seems legal, but fails in compilation - will investigate it next. |
|
Think I found the problem I ran into -- see commit message for details. I have with this been able to take two .ll files, replace all uses of 'double' with 'half' (thousands of them), and it compiles. Doing this, there had been a lot of vectorization done with 'double'. I have gone over the handlings of v4f32 and duplicated/handled for v8f16 similarly. I hope there are no generic cases in the backend that have some assumption about float/double that would go wrong with 'half'. Is there any code with _Float16 that could be compiled and run, for further testing? |
|
getVectorTypeBreakdownForCallingConv() + tests |
|
Patch rebased (now on top of the recent commit for SP alignment), and tests updated. vec-abi-01.ll and vec-abi-02.ll have the alignments as generated by clang. The alignments are different with or without vector support, and the dynamic alignments are gone with vector support (vec-abi-02.ll). @arsenm @paulwalker-arm @pkarveti Does this patch look reasonable/correct to you? Short summary:
|
In preparation for #171066 (FP16 vector support).
In preparation for llvm#171066 (FP16 vector support).
In preparation for llvm#171066 (FP16 vector support).
uweigand
left a comment
There was a problem hiding this comment.
This version now generally looks good to me, thanks! A couple of remaining cosmetic issues inline.
The tests show a bit of room for improvement in codegen, but I think this can be handled in follow-on patches where it makes sense.
|
Should we backport this to 22.x? |
Given that this fixes an ABI issue, I think we should. @JonPsson1 can you prepare a backport? |
|
/cherry-pick 999e9a |
|
Failed to cherry-pick: 999e9a https://github.com/llvm/llvm-project/actions/runs/21408016122 Please manually backport the fix and push it to your github fork. Once this is done, please create a pull request |
|
@tstellar Looking at the failed cherry-pick, I see such things as "github-automation.py:57: SyntaxWarning: invalid escape sequence '\w'", rather than any merge problems. Would you know what's going wrong? Thanks. |
|
/cherry-pick c999e9a You provided an incomplete commit hash:
|
|
Not sure about this error, but I would in fact expect merge problems because of the other patch you precommitted ... |
|
Failed to cherry-pick: c999e9a https://github.com/llvm/llvm-project/actions/runs/21413991803 Please manually backport the fix and push it to your github fork. Once this is done, please create a pull request |
|
/cherry-pick e0a1326 c999e9a |
|
Failed to create pull request for issue171066 https://github.com/llvm/llvm-project/actions/runs/21414296599 |
|
I'll try to do this separately then for the pre-commit first. |
|
/pull-request #178300 |
|
@JonPsson1 It's a bit confusing due to how it's rendered, but you need to use full commit hashes like |
I copied the hashes from my local repo so it's surprising they would become ambiguous. Anyway, thanks for the help, and I will use full hashes next time... |
- Make v8f16 a legal type so that arguments can be passed in vector registers. Handle fp16 vectors so that they have the same ABI as other fp vectors. - Set the preferred vector action for fp16 vectors to "split". This will scalarize all operations, which is not always necessary (like with memory operations), but it avoids the superfluous operations that result after first widening and then scalarizing a narrow vector (like v4f16). Fixes llvm#168992
In preparation for llvm#171066 (FP16 vector support). (cherry picked from commit e0a1326)
- Make v8f16 a legal type so that arguments can be passed in vector registers. Handle fp16 vectors so that they have the same ABI as other fp vectors. - Set the preferred vector action for fp16 vectors to "split". This will scalarize all operations, which is not always necessary (like with memory operations), but it avoids the superfluous operations that result after first widening and then scalarizing a narrow vector (like v4f16). Fixes llvm#168992 (cherry picked from commit c999e9a)
- Make v8f16 a legal type so that arguments can be passed in vector registers. Handle fp16 vectors so that they have the same ABI as other fp vectors. - Set the preferred vector action for fp16 vectors to "split". This will scalarize all operations, which is not always necessary (like with memory operations), but it avoids the superfluous operations that result after first widening and then scalarizing a narrow vector (like v4f16). Fixes llvm#168992
Make v8f16 a legal type so that arguments can be passed in vector registers. Handle fp16 vectors so that they have the same ABI as other fp vectors.
Set the preferred vector action for fp16 vectors to "split". This will scalarize all operations, which is not always necessary (like with memory operations), but it avoids the superfluous operations that result after first widening and then scalarizing a narrow vector (like v4f16).
This seems to be the better handling, at least if narrow (e.g. 4 element) vectors are considered and not supposed to get all 8 ops emitted. Patch in progress, with some tests in place that are passing. There are several opportunities for optimizing the results, but not sure if or what would be relevant at this point, with the conversion routines having a heavy overhead.
Fixes #168992