diff --git a/clang/test/CodeGen/SystemZ/systemz-abi-vector.c b/clang/test/CodeGen/SystemZ/systemz-abi-vector.c index fab6050a0d876..0c577723c6265 100644 --- a/clang/test/CodeGen/SystemZ/systemz-abi-vector.c +++ b/clang/test/CodeGen/SystemZ/systemz-abi-vector.c @@ -29,16 +29,19 @@ typedef __attribute__((vector_size(1))) char v1i8; typedef __attribute__((vector_size(2))) char v2i8; typedef __attribute__((vector_size(2))) short v1i16; +typedef __attribute__((vector_size(2))) _Float16 v1f16; typedef __attribute__((vector_size(4))) char v4i8; typedef __attribute__((vector_size(4))) short v2i16; typedef __attribute__((vector_size(4))) int v1i32; +typedef __attribute__((vector_size(4))) _Float16 v2f16; typedef __attribute__((vector_size(4))) float v1f32; typedef __attribute__((vector_size(8))) char v8i8; typedef __attribute__((vector_size(8))) short v4i16; typedef __attribute__((vector_size(8))) int v2i32; typedef __attribute__((vector_size(8))) long long v1i64; +typedef __attribute__((vector_size(8))) _Float16 v4f16; typedef __attribute__((vector_size(8))) float v2f32; typedef __attribute__((vector_size(8))) double v1f64; @@ -47,11 +50,20 @@ typedef __attribute__((vector_size(16))) short v8i16; typedef __attribute__((vector_size(16))) int v4i32; typedef __attribute__((vector_size(16))) long long v2i64; typedef __attribute__((vector_size(16))) __int128 v1i128; +typedef __attribute__((vector_size(16))) _Float16 v8f16; typedef __attribute__((vector_size(16))) float v4f32; typedef __attribute__((vector_size(16))) double v2f64; typedef __attribute__((vector_size(16))) long double v1f128; typedef __attribute__((vector_size(32))) char v32i8; +typedef __attribute__((vector_size(32))) short v16i16; +typedef __attribute__((vector_size(32))) int v8i32; +typedef __attribute__((vector_size(32))) long long v4i64; +typedef __attribute__((vector_size(32))) __int128 v2i128; +typedef __attribute__((vector_size(32))) _Float16 v16f16; +typedef __attribute__((vector_size(32))) float v8f32; +typedef __attribute__((vector_size(32))) double v4f64; +typedef __attribute__((vector_size(32))) long double v2f128; unsigned int align = __alignof__ (v16i8); // CHECK: @align ={{.*}} global i32 16 @@ -97,6 +109,10 @@ v8i16 pass_v8i16(v8i16 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v8i16(ptr dead_on_unwind noalias writable sret(<8 x i16>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <8 x i16> @pass_v8i16(<8 x i16> %{{.*}}) +v16i16 pass_v16i16(v16i16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v16i16(ptr dead_on_unwind noalias writable sret(<16 x i16>) align 32 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v16i16(ptr dead_on_unwind noalias writable sret(<16 x i16>) align 8 %{{.*}}, ptr dead_on_return %0) + v1i32 pass_v1i32(v1i32 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v1i32(ptr dead_on_unwind noalias writable sret(<1 x i32>) align 4 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i32> @pass_v1i32(<1 x i32> %{{.*}}) @@ -109,6 +125,10 @@ v4i32 pass_v4i32(v4i32 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v4i32(ptr dead_on_unwind noalias writable sret(<4 x i32>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <4 x i32> @pass_v4i32(<4 x i32> %{{.*}}) +v8i32 pass_v8i32(v8i32 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v8i32(ptr dead_on_unwind noalias writable sret(<8 x i32>) align 32 %{{.*}}, ptr dead_on_return %0) +// 
CHECK-VECTOR-LABEL: define{{.*}} void @pass_v8i32(ptr dead_on_unwind noalias writable sret(<8 x i32>) align 8 %{{.*}}, ptr dead_on_return %0) + v1i64 pass_v1i64(v1i64 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v1i64(ptr dead_on_unwind noalias writable sret(<1 x i64>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i64> @pass_v1i64(<1 x i64> %{{.*}}) @@ -117,10 +137,38 @@ v2i64 pass_v2i64(v2i64 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v2i64(ptr dead_on_unwind noalias writable sret(<2 x i64>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x i64> @pass_v2i64(<2 x i64> %{{.*}}) +v4i64 pass_v4i64(v4i64 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v4i64(ptr dead_on_unwind noalias writable sret(<4 x i64>) align 32 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v4i64(ptr dead_on_unwind noalias writable sret(<4 x i64>) align 8 %{{.*}}, ptr dead_on_return %0) + v1i128 pass_v1i128(v1i128 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v1i128(ptr dead_on_unwind noalias writable sret(<1 x i128>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x i128> @pass_v1i128(<1 x i128> %{{.*}}) +v2i128 pass_v2i128(v2i128 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v2i128(ptr dead_on_unwind noalias writable sret(<2 x i128>) align 32 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v2i128(ptr dead_on_unwind noalias writable sret(<2 x i128>) align 8 %{{.*}}, ptr dead_on_return %0) + +v1f16 pass_v1f16(v1f16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v1f16(ptr dead_on_unwind noalias writable sret(<1 x half>) align 2 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} <1 x half> @pass_v1f16(<1 x half> %{{.*}}) + +v2f16 pass_v2f16(v2f16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v2f16(ptr dead_on_unwind noalias writable sret(<2 x half>) align 4 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} <2 x half> @pass_v2f16(<2 x half> %{{.*}}) + +v4f16 pass_v4f16(v4f16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v4f16(ptr dead_on_unwind noalias writable sret(<4 x half>) align 8 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} <4 x half> @pass_v4f16(<4 x half> %{{.*}}) + +v8f16 pass_v8f16(v8f16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v8f16(ptr dead_on_unwind noalias writable sret(<8 x half>) align 16 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} <8 x half> @pass_v8f16(<8 x half> %{{.*}}) + +v16f16 pass_v16f16(v16f16 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v16f16(ptr dead_on_unwind noalias writable sret(<16 x half>) align 32 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v16f16(ptr dead_on_unwind noalias writable sret(<16 x half>) align 8 %{{.*}}, ptr dead_on_return %0) + v1f32 pass_v1f32(v1f32 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v1f32(ptr dead_on_unwind noalias writable sret(<1 x float>) align 4 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x float> @pass_v1f32(<1 x float> %{{.*}}) @@ -133,6 +181,10 @@ v4f32 pass_v4f32(v4f32 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v4f32(ptr dead_on_unwind noalias writable sret(<4 x float>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <4 x 
float> @pass_v4f32(<4 x float> %{{.*}}) +v8f32 pass_v8f32(v8f32 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v8f32(ptr dead_on_unwind noalias writable sret(<8 x float>) align 32 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v8f32(ptr dead_on_unwind noalias writable sret(<8 x float>) align 8 %{{.*}}, ptr dead_on_return %0) + v1f64 pass_v1f64(v1f64 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v1f64(ptr dead_on_unwind noalias writable sret(<1 x double>) align 8 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x double> @pass_v1f64(<1 x double> %{{.*}}) @@ -141,10 +193,17 @@ v2f64 pass_v2f64(v2f64 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v2f64(ptr dead_on_unwind noalias writable sret(<2 x double>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <2 x double> @pass_v2f64(<2 x double> %{{.*}}) +v4f64 pass_v4f64(v4f64 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v4f64(ptr dead_on_unwind noalias writable sret(<4 x double>) align 32 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v4f64(ptr dead_on_unwind noalias writable sret(<4 x double>) align 8 %{{.*}}, ptr dead_on_return %0) + v1f128 pass_v1f128(v1f128 arg) { return arg; } // CHECK-LABEL: define{{.*}} void @pass_v1f128(ptr dead_on_unwind noalias writable sret(<1 x fp128>) align 16 %{{.*}}, ptr dead_on_return %0) // CHECK-VECTOR-LABEL: define{{.*}} <1 x fp128> @pass_v1f128(<1 x fp128> %{{.*}}) +v2f128 pass_v2f128(v2f128 arg) { return arg; } +// CHECK-LABEL: define{{.*}} void @pass_v2f128(ptr dead_on_unwind noalias writable sret(<2 x fp128>) align 32 %{{.*}}, ptr dead_on_return %0) +// CHECK-VECTOR-LABEL: define{{.*}} void @pass_v2f128(ptr dead_on_unwind noalias writable sret(<2 x fp128>) align 8 %{{.*}}, ptr dead_on_return %0) // Vector-like aggregate types diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td index 2795de5eeeb66..69202e3fcbc57 100644 --- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td +++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td @@ -50,7 +50,7 @@ def RetCC_SystemZ_ELF : CallingConv<[ // Sub-128 vectors are returned in the same way, but they're widened // to one of these types during type legalization. CCIfSubtarget<"hasVector()", - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>> ]>; @@ -116,19 +116,19 @@ def CC_SystemZ_ELF : CallingConv<[ // are passed in the same way, but they're widened to one of these types // during type legalization. CCIfSubtarget<"hasVector()", - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCIfArgFixed<CCAssignToReg<[V24, V26, V28, V30, V25, V27, V29, V31]>>>>, // However, sub-128 vectors which need to go on the stack occupy just a // single 8-byte-aligned 8-byte stack slot. Pass as i64. CCIfSubtarget<"hasVector()", - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCIfShortVector<CCBitConvertToType<i64>>>>, // Other vector arguments are passed in 8-byte-aligned 16-byte stack slots. CCIfSubtarget<"hasVector()", - CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToStack<16, 8>>>, // Other arguments are passed in 8-byte-aligned 8-byte stack slots.
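To illustrate the calling-convention change above (a hypothetical example, not taken from the patch): compiled for a vector-capable CPU, e.g. llc -mtriple=s390x-linux-gnu -mcpu=z16, a fully used <8 x half> argument and return value should now travel in a single vector register (%v24), just as v4f32 and v2f64 already do, while without the vector facility it is still passed indirectly:

; Hypothetical function, for illustration only.
define <8 x half> @pass_v8f16_example(<8 x half> %arg) {
  ret <8 x half> %arg
}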
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 3bc90a16f51db..b0b072a5b1f26 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -123,6 +123,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass); addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass); addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass); + addRegisterClass(MVT::v8f16, &SystemZ::VR128BitRegClass); addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass); addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass); } @@ -620,13 +621,16 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // Handle floating-point vector types. if (Subtarget.hasVector()) { // Scalar-to-vector conversion is just a subreg. + setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Legal); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2f64, Legal); // Some insertions and extractions can be done directly but others // need to go via integers. + setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); @@ -842,6 +846,19 @@ bool SystemZTargetLowering::useSoftFloat() const { return Subtarget.hasSoftFloat(); } +unsigned SystemZTargetLowering::getVectorTypeBreakdownForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, MVT &RegisterVT) const { + // Pass fp16 vectors in VR(s). + if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16) { + IntermediateVT = RegisterVT = MVT::v8f16; + return NumIntermediates = + divideCeil(VT.getVectorNumElements(), SystemZ::VectorBytes / 2); + } + return TargetLowering::getVectorTypeBreakdownForCallingConv( + Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); +} + MVT SystemZTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { @@ -850,9 +867,20 @@ MVT SystemZTargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, if (VT.isVector() && VT.getSizeInBits() == 128 && VT.getVectorNumElements() == 1) return MVT::v16i8; + // Pass fp16 vectors in VR(s). + if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16) + return MVT::v8f16; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } +unsigned SystemZTargetLowering::getNumRegistersForCallingConv( + LLVMContext &Context, CallingConv::ID CC, EVT VT) const { + // Pass fp16 vectors in VR(s). 
+ if (Subtarget.hasVector() && VT.isVector() && VT.getScalarType() == MVT::f16) + return divideCeil(VT.getVectorNumElements(), SystemZ::VectorBytes / 2); + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); +} + EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const { if (!VT.isVector()) @@ -2063,6 +2091,7 @@ SDValue SystemZTargetLowering::LowerFormalArguments( case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: + case MVT::v8f16: case MVT::v4f32: case MVT::v2f64: RC = &SystemZ::VR128BitRegClass; @@ -6358,6 +6387,38 @@ bool SystemZTargetLowering::isVectorElementLoad(SDValue Op) const { return false; } +static SDValue mergeHighParts(SelectionDAG &DAG, const SDLoc &DL, + unsigned MergedBits, EVT VT, SDValue Op0, + SDValue Op1) { + MVT IntVecVT = MVT::getVectorVT(MVT::getIntegerVT(MergedBits), + SystemZ::VectorBits / MergedBits); + assert(VT.getSizeInBits() == 128 && IntVecVT.getSizeInBits() == 128 && + "Handling full vectors only."); + Op0 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0); + Op1 = DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op1); + SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH, DL, IntVecVT, Op0, Op1); + return DAG.getNode(ISD::BITCAST, DL, VT, Op); +} + +static SDValue buildFPVecFromScalars4(SelectionDAG &DAG, const SDLoc &DL, + EVT VT, SmallVectorImpl<SDValue> &Elems, + unsigned Pos) { + SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 0], Elems[Pos + 1]); + SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[Pos + 2], Elems[Pos + 3]); + // Avoid unnecessary undefs by reusing the other operand. + if (Op01.isUndef()) { + if (Op23.isUndef()) + return Op01; + Op01 = Op23; + } else if (Op23.isUndef()) + Op23 = Op01; + // Merging identical replications is a no-op. + if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23) + return Op01; + unsigned MergedBits = VT.getSimpleVT().getScalarSizeInBits() * 2; + return mergeHighParts(DAG, DL, MergedBits, VT, Op01, Op23); +} + // Combine GPR scalar values Elems into a vector of type VT. SDValue SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, @@ -6416,22 +6477,22 @@ SystemZTargetLowering::buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT, // // V VMRHG // - if (VT == MVT::v4f32 && !AllLoads) { - SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]); - SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]); + if (VT == MVT::v4f32 && !AllLoads) + return buildFPVecFromScalars4(DAG, DL, VT, Elems, 0); + + // Same for v8f16. + if (VT == MVT::v8f16 && !AllLoads) { + SDValue Op0123 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 0); + SDValue Op4567 = buildFPVecFromScalars4(DAG, DL, VT, Elems, 4); // Avoid unnecessary undefs by reusing the other operand. - if (Op01.isUndef()) - Op01 = Op23; - else if (Op23.isUndef()) - Op23 = Op01; + if (Op0123.isUndef()) + Op0123 = Op4567; + else if (Op4567.isUndef()) + Op4567 = Op0123; // Merging identical replications is a no-op. - if (Op01.getOpcode() == SystemZISD::REPLICATE && Op01 == Op23) - return Op01; - Op01 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op01); - Op23 = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Op23); - SDValue Op = DAG.getNode(SystemZISD::MERGE_HIGH, - DL, MVT::v2i64, Op01, Op23); - return DAG.getNode(ISD::BITCAST, DL, VT, Op); + if (Op0123.getOpcode() == SystemZISD::REPLICATE && Op0123 == Op4567) + return Op0123; + return mergeHighParts(DAG, DL, 64, VT, Op0123, Op4567); } // Collect the constant terms.
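A sketch of the kind of input that exercises the refactored buildVector path (hypothetical IR, not from the patch): for an <8 x half> assembled from eight scalars, the new buildFPVecFromScalars4/mergeHighParts helpers should emit a merge tree of four vmrhh, two vmrhf and one final vmrhg, mirroring the existing v4f32 strategy rather than going through memory:

; Hypothetical input for the v8f16 BUILD_VECTOR lowering (e.g. -mcpu=z16).
define <8 x half> @build_v8f16_example(half %e0, half %e1, half %e2, half %e3, half %e4, half %e5, half %e6, half %e7) {
  %v0 = insertelement <8 x half> poison, half %e0, i32 0
  %v1 = insertelement <8 x half> %v0, half %e1, i32 1
  %v2 = insertelement <8 x half> %v1, half %e2, i32 2
  %v3 = insertelement <8 x half> %v2, half %e3, i32 3
  %v4 = insertelement <8 x half> %v3, half %e4, i32 4
  %v5 = insertelement <8 x half> %v4, half %e5, i32 5
  %v6 = insertelement <8 x half> %v5, half %e6, i32 6
  %v7 = insertelement <8 x half> %v6, half %e7, i32 7
  ret <8 x half> %v7
}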
@@ -6627,9 +6688,13 @@ SDValue SystemZTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, // Otherwise bitcast to the equivalent integer form and insert via a GPR. MVT IntVT = MVT::getIntegerVT(VT.getScalarSizeInBits()); MVT IntVecVT = MVT::getVectorVT(IntVT, VT.getVectorNumElements()); - SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntVecVT, - DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0), - DAG.getNode(ISD::BITCAST, DL, IntVT, Op1), Op2); + SDValue IntOp1 = + VT == MVT::v8f16 + ? DAG.getZExtOrTrunc(convertFromF16(Op1, DL, DAG), DL, MVT::i32) + : DAG.getNode(ISD::BITCAST, DL, IntVT, Op1); + SDValue Res = + DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, IntVecVT, + DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0), IntOp1, Op2); return DAG.getNode(ISD::BITCAST, DL, VT, Res); } @@ -6654,9 +6719,12 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, // Otherwise bitcast to the equivalent integer form and extract via a GPR. MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits()); MVT IntVecVT = MVT::getVectorVT(IntVT, VecVT.getVectorNumElements()); - SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntVT, - DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0), Op1); - return DAG.getNode(ISD::BITCAST, DL, VT, Res); + MVT ExtrVT = IntVT == MVT::i16 ? MVT::i32 : IntVT; + SDValue Extr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrVT, + DAG.getNode(ISD::BITCAST, DL, IntVecVT, Op0), Op1); + if (VT == MVT::f16) + return convertToF16(DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Extr), DAG); + return DAG.getNode(ISD::BITCAST, DL, VT, Extr); } SDValue SystemZTargetLowering:: diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 2af00a5cde7ad..2f78a285ae006 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -64,6 +64,12 @@ class SystemZTargetLowering : public TargetLowering { // // (c) there are no multiplication instructions for the widest integer // type (v2i64). + + // Expand (narrow) f16 vectors during type legalization to avoid + // operations for all elements as with expansion after widening. + if (VT.getScalarType() == MVT::f16) + return VT.getVectorElementCount().isScalar() ? 
TypeScalarizeVector : TypeSplitVector; if (VT.getScalarSizeInBits() % 8 == 0) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -76,8 +82,16 @@ return 1; return TargetLowering::getNumRegisters(Context, VT); } + unsigned + getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, + EVT VT, EVT &IntermediateVT, + unsigned &NumIntermediates, + MVT &RegisterVT) const override; MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override; + unsigned getNumRegistersForCallingConv(LLVMContext &Context, + CallingConv::ID CC, + EVT VT) const override; bool isCheapToSpeculateCtlz(Type *) const override { return true; } bool isCheapToSpeculateCttz(Type *) const override { return true; } bool preferZeroCompareBranch() const override { return true; } diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td index 479bab5ce62b8..eb5753cfcde99 100644 --- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td +++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td @@ -133,6 +133,8 @@ let Predicates = [FeatureVector] in { def VLREPH : UnaryVRX<"vlreph", 0xE705, z_replicate_loadi16, v128h, 2, 1>; def VLREPF : UnaryVRX<"vlrepf", 0xE705, z_replicate_loadi32, v128f, 4, 2>; def VLREPG : UnaryVRX<"vlrepg", 0xE705, z_replicate_loadi64, v128g, 8, 3>; + def : Pat<(v8f16 (z_replicate_loadf16 bdxaddr12only:$addr)), + (VLREPH bdxaddr12only:$addr)>; def : Pat<(v4f32 (z_replicate_loadf32 bdxaddr12only:$addr)), (VLREPF bdxaddr12only:$addr)>; def : Pat<(v2f64 (z_replicate_loadf64 bdxaddr12only:$addr)), @@ -229,6 +231,9 @@ let Predicates = [FeatureVector] in { def VSTEH : StoreBinaryVRX<"vsteh", 0xE709, z_vstei16, v128h, 2, imm32zx3>; def VSTEF : StoreBinaryVRX<"vstef", 0xE70B, z_vstei32, v128f, 4, imm32zx2>; def VSTEG : StoreBinaryVRX<"vsteg", 0xE70A, z_vstei64, v128g, 8, imm32zx1>; + def : Pat<(z_vstef16 (v8f16 VR128:$val), bdxaddr12only:$addr, + imm32zx3:$index), + (VSTEH VR128:$val, bdxaddr12only:$addr, imm32zx3:$index)>; def : Pat<(z_vstef32 (v4f32 VR128:$val), bdxaddr12only:$addr, imm32zx2:$index), (VSTEF VR128:$val, bdxaddr12only:$addr, imm32zx2:$index)>; @@ -279,6 +284,8 @@ let Predicates = [FeatureVectorEnhancements2] in { def VLERH : UnaryVRX<"vlerh", 0xE607, z_loadeswap, v128h, 16, 1>; def VLERF : UnaryVRX<"vlerf", 0xE607, z_loadeswap, v128f, 16, 2>; def VLERG : UnaryVRX<"vlerg", 0xE607, z_loadeswap, v128g, 16, 3>; + def : Pat<(v8f16 (z_loadeswap bdxaddr12only:$addr)), + (VLERH bdxaddr12only:$addr)>; def : Pat<(v4f32 (z_loadeswap bdxaddr12only:$addr)), (VLERF bdxaddr12only:$addr)>; def : Pat<(v2f64 (z_loadeswap bdxaddr12only:$addr)), @@ -320,6 +327,8 @@ let Predicates = [FeatureVectorEnhancements2] in { def VSTERH : StoreVRX<"vsterh", 0xE60F, z_storeeswap, v128h, 16, 1>; def VSTERF : StoreVRX<"vsterf", 0xE60F, z_storeeswap, v128f, 16, 2>; def VSTERG : StoreVRX<"vsterg", 0xE60F, z_storeeswap, v128g, 16, 3>; + def : Pat<(z_storeeswap (v8f16 VR128:$val), bdxaddr12only:$addr), + (VSTERH VR128:$val, bdxaddr12only:$addr)>; def : Pat<(z_storeeswap (v4f32 VR128:$val), bdxaddr12only:$addr), (VSTERF VR128:$val, bdxaddr12only:$addr)>; def : Pat<(z_storeeswap (v2f64 VR128:$val), bdxaddr12only:$addr), @@ -348,6 +357,7 @@ let Predicates = [FeatureVector] in { def VMRHH : BinaryVRRc<"vmrhh", 0xE761, z_merge_high, v128h, v128h, 1>; def VMRHF : BinaryVRRc<"vmrhf", 0xE761, z_merge_high, v128f, v128f, 2>; def VMRHG : BinaryVRRc<"vmrhg", 0xE761, z_merge_high, v128g, v128g, 3>;
+ def : BinaryRRWithType<VMRHH, VR128, z_merge_high, v8f16>; def : BinaryRRWithType<VMRHF, VR128, z_merge_high, v4f32>; def : BinaryRRWithType<VMRHG, VR128, z_merge_high, v2f64>; @@ -357,6 +367,7 @@ let Predicates = [FeatureVector] in { def VMRLH : BinaryVRRc<"vmrlh", 0xE760, z_merge_low, v128h, v128h, 1>; def VMRLF : BinaryVRRc<"vmrlf", 0xE760, z_merge_low, v128f, v128f, 2>; def VMRLG : BinaryVRRc<"vmrlg", 0xE760, z_merge_low, v128g, v128g, 3>; + def : BinaryRRWithType<VMRLH, VR128, z_merge_low, v8f16>; def : BinaryRRWithType<VMRLF, VR128, z_merge_low, v4f32>; def : BinaryRRWithType<VMRLG, VR128, z_merge_low, v2f64>; @@ -376,6 +387,8 @@ let Predicates = [FeatureVector] in { def VREPH : BinaryVRIc<"vreph", 0xE74D, z_splat, v128h, v128h, 1>; def VREPF : BinaryVRIc<"vrepf", 0xE74D, z_splat, v128f, v128f, 2>; def VREPG : BinaryVRIc<"vrepg", 0xE74D, z_splat, v128g, v128g, 3>; + def : Pat<(v8f16 (z_splat VR128:$vec, imm32zx16_timm:$index)), + (VREPH VR128:$vec, imm32zx16:$index)>; def : Pat<(v4f32 (z_splat VR128:$vec, imm32zx16_timm:$index)), (VREPF VR128:$vec, imm32zx16:$index)>; def : Pat<(v2f64 (z_splat VR128:$vec, imm32zx16_timm:$index)), @@ -497,6 +510,7 @@ defm : GenericVectorOps<v16i8, v16i8>; defm : GenericVectorOps<v8i16, v8i16>; defm : GenericVectorOps<v4i32, v4i32>; defm : GenericVectorOps<v2i64, v2i64>; +defm : GenericVectorOps<v8f16, v8i16>; defm : GenericVectorOps<v4f32, v4i32>; defm : GenericVectorOps<v2f64, v2i64>; @@ -516,6 +530,7 @@ defm : BlendVectorOps<v16i8, v16i8>; defm : BlendVectorOps<v8i16, v8i16>; defm : BlendVectorOps<v4i32, v4i32>; defm : BlendVectorOps<v2i64, v2i64>; +defm : BlendVectorOps<v8f16, v8i16>; defm : BlendVectorOps<v4f32, v4i32>; defm : BlendVectorOps<v2f64, v2i64>; @@ -2110,6 +2125,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (i128 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8f16 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>; @@ -2118,6 +2134,7 @@ def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (i128 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v8f16 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>; @@ -2126,6 +2143,7 @@ def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (i128 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8f16 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>; @@ -2134,15 +2152,26 @@ def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (i128 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8f16 VR128:$src))), (v2i64 VR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v16i8 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v8i16 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v4i32 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v2i64 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (i128 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v4f32 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (v2f64 VR128:$src))), (v8f16 VR128:$src)>; +def : Pat<(v8f16 (bitconvert (f128 VR128:$src))), (v8f16 VR128:$src)>; + def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (i128 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8f16 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>; @@ -2151,6 +2180,7 @@ def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (i128 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8f16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>; @@ -2159,6 +2189,7 @@ def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (i128 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v8f16 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>; def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>; @@ -2166,6 +2197,7 @@ def : Pat<(i128 (bitconvert (v16i8 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (v8i16 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (v4i32 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (v2i64 VR128:$src))), (i128 VR128:$src)>; +def : Pat<(i128 (bitconvert (v8f16 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (v4f32 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (v2f64 VR128:$src))), (i128 VR128:$src)>; def : Pat<(i128 (bitconvert (f128 VR128:$src))), (i128 VR128:$src)>; @@ -2216,6 +2248,7 @@ multiclass ScalarToVectorFP } +defm : ScalarToVectorFP<VREPH, v8f16, FP16, subreg_h16>; defm : ScalarToVectorFP<VREPF, v4f32, FP32, subreg_h32>; defm : ScalarToVectorFP<VREPG, v2f64, FP64, subreg_h64>; @@ -2236,6 +2269,11 @@ let AddedComplexity = 4 in { // 3 added by TableGen for the base register operand in VLGV-based integer // extractions and ensures that this version is strictly better.
let AddedComplexity = 4 in { + def : Pat<(f16 (z_vector_extract (v8f16 VR128:$vec), 0)), + (EXTRACT_SUBREG VR128:$vec, subreg_h16)>; + def : Pat<(f16 (z_vector_extract (v8f16 VR128:$vec), imm32zx3:$index)), + (EXTRACT_SUBREG (VREPH VR128:$vec, imm32zx3:$index), subreg_h16)>; + def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), 0)), (EXTRACT_SUBREG VR128:$vec, subreg_h32)>; def : Pat<(f32 (z_vector_extract (v4f32 VR128:$vec), imm32zx2:$index)), diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td index a02cafaaafcdf..2a5b0435c1565 100644 --- a/llvm/lib/Target/SystemZ/SystemZOperators.td +++ b/llvm/lib/Target/SystemZ/SystemZOperators.td @@ -1195,6 +1195,7 @@ def z_replicate_loadi8 : z_replicate_load<i32, anyextloadi8>; def z_replicate_loadi16 : z_replicate_load<i32, anyextloadi16>; def z_replicate_loadi32 : z_replicate_load<i32, load>; def z_replicate_loadi64 : z_replicate_load<i64, load>; +def z_replicate_loadf16 : z_replicate_load<f16, load>; def z_replicate_loadf32 : z_replicate_load<f32, load>; def z_replicate_loadf64 : z_replicate_load<f64, load>; // Byte-swapped replicated vector element loads. @@ -1211,6 +1212,7 @@ def z_vlei8 : z_vle<i32, anyextloadi8>; def z_vlei16 : z_vle<i32, anyextloadi16>; def z_vlei32 : z_vle<i32, load>; def z_vlei64 : z_vle<i64, load>; +def z_vlef16 : z_vle<f16, load>; def z_vlef32 : z_vle<f32, load>; def z_vlef64 : z_vle<f64, load>; // Byte-swapped vector element loads. @@ -1282,6 +1284,7 @@ def z_vstei8 : z_vste<i32, truncstorei8>; def z_vstei16 : z_vste<i32, truncstorei16>; def z_vstei32 : z_vste<i32, store>; def z_vstei64 : z_vste<i64, store>; +def z_vstef16 : z_vste<f16, store>; def z_vstef32 : z_vste<f32, store>; def z_vstef64 : z_vste<f64, store>; // Byte-swapped vector element stores. diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td index e79f12b449a88..1ef8e81c8f829 100644 --- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -305,13 +305,13 @@ defm VR64 : SystemZRegClass<"VR64", [f64, v8i8, v4i16, v2i32, v2f32], 64, // The subset of vector registers that can be used for floating-point // operations too. defm VF128 : SystemZRegClass<"VF128", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, - (sequence "V%u", 0, 15)>; + [v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + 128, (sequence "V%u", 0, 15)>; // All vector registers.
defm VR128 : SystemZRegClass<"VR128", [v16i8, v8i16, v4i32, v2i64, i128, - v4f32, v2f64, f128], + v8f16, v4f32, v2f64, f128], 128, (add (sequence "V%u", 0, 7), (sequence "V%u", 16, 31), (sequence "V%u", 8, 15))>; diff --git a/llvm/test/CodeGen/SystemZ/atomic-memops.ll b/llvm/test/CodeGen/SystemZ/atomic-memops.ll index 0bc647aa0e0f7..ae2a74d030caf 100644 --- a/llvm/test/CodeGen/SystemZ/atomic-memops.ll +++ b/llvm/test/CodeGen/SystemZ/atomic-memops.ll @@ -396,6 +396,18 @@ define void @f24(ptr %src, ptr %dst) { ret void } +define void @f25_half(ptr %src, ptr %dst) { +; CHECK-LABEL: f25_half: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: br %r14 + %b = load atomic half, ptr %src seq_cst, align 2 + %v = insertelement <8 x half> undef, half %b, i32 1 + store volatile <8 x half> %v, ptr %dst + ret void +} + define void @f25(ptr %src, ptr %dst) { ; CHECK-LABEL: f25: ; CHECK: # %bb.0: @@ -614,7 +626,7 @@ define void @f43(ptr %ptr) { define void @f44(ptr %ptr) { ; CHECK-LABEL: f44: ; CHECK: # %bb.0: -; CHECK-NEXT: larl %r1, .LCPI49_0 +; CHECK-NEXT: larl %r1, .LCPI50_0 ; CHECK-NEXT: ld %f0, 0(%r1) ; CHECK-NEXT: std %f0, 0(%r2) ; CHECK-NEXT: bcr 14, %r0 @@ -669,6 +681,17 @@ define void @f48(<2 x i64> %val, ptr %ptr) { ret void } +define void @f49_half(<8 x half> %val, ptr %ptr) { +; CHECK-LABEL: f49_half: +; CHECK: # %bb.0: +; CHECK-NEXT: vsteh %v24, 0(%r2), 0 +; CHECK-NEXT: bcr 14, %r0 +; CHECK-NEXT: br %r14 + %element = extractelement <8 x half> %val, i32 0 + store atomic half %element, ptr %ptr seq_cst, align 4 + ret void +} + define void @f49(<4 x float> %val, ptr %ptr) { ; CHECK-LABEL: f49: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll index e02f931c4d31e..d0f3414e89497 100644 --- a/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll +++ b/llvm/test/CodeGen/SystemZ/canonicalize-vars.ll @@ -111,87 +111,93 @@ define void @canonicalize_ptr_f128(ptr %out) { define <8 x half> @canonicalize_v8f16(<8 x half> %a) nounwind { ; Z16-LABEL: canonicalize_v8f16: ; Z16: # %bb.0: -; Z16-NEXT: stmg %r13, %r15, 104(%r15) +; Z16-NEXT: stmg %r14, %r15, 112(%r15) ; Z16-NEXT: aghi %r15, -224 -; Z16-NEXT: std %f8, 216(%r15) # 8-byte Spill -; Z16-NEXT: std %f9, 208(%r15) # 8-byte Spill -; Z16-NEXT: std %f10, 200(%r15) # 8-byte Spill -; Z16-NEXT: std %f11, 192(%r15) # 8-byte Spill -; Z16-NEXT: std %f12, 184(%r15) # 8-byte Spill -; Z16-NEXT: std %f13, 176(%r15) # 8-byte Spill -; Z16-NEXT: std %f14, 168(%r15) # 8-byte Spill -; Z16-NEXT: std %f15, 160(%r15) # 8-byte Spill -; Z16-NEXT: vlreph %v11, 414(%r15) -; Z16-NEXT: vlreph %v12, 406(%r15) -; Z16-NEXT: vlreph %v13, 398(%r15) -; Z16-NEXT: vlreph %v14, 390(%r15) -; Z16-NEXT: ldr %f8, %f6 -; Z16-NEXT: ldr %f9, %f4 -; Z16-NEXT: ldr %f10, %f2 -; Z16-NEXT: lgr %r13, %r2 +; Z16-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill +; Z16-NEXT: vreph %v0, %v24, 7 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f15, %f0 -; Z16-NEXT: ldr %f0, %f10 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 6 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, 
__truncsfhf2@PLT -; Z16-NEXT: ldr %f10, %f0 -; Z16-NEXT: ldr %f0, %f9 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 5 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f9, %f0 -; Z16-NEXT: ldr %f0, %f8 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 4 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f8, %f0 -; Z16-NEXT: ldr %f0, %f14 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhf %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 3 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f14, %f0 -; Z16-NEXT: ldr %f0, %f13 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 2 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f13, %f0 -; Z16-NEXT: ldr %f0, %f12 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f12, %f0 -; Z16-NEXT: ldr %f0, %f11 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 1 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: vsteh %v0, 14(%r13), 0 -; Z16-NEXT: vsteh %v12, 12(%r13), 0 -; Z16-NEXT: vsteh %v13, 10(%r13), 0 -; Z16-NEXT: vsteh %v14, 8(%r13), 0 -; Z16-NEXT: vsteh %v8, 6(%r13), 0 -; Z16-NEXT: vsteh %v9, 4(%r13), 0 -; Z16-NEXT: vsteh %v10, 2(%r13), 0 -; Z16-NEXT: vsteh %v15, 0(%r13), 0 -; Z16-NEXT: ld %f8, 216(%r15) # 8-byte Reload -; Z16-NEXT: ld %f9, 208(%r15) # 8-byte Reload -; Z16-NEXT: ld %f10, 200(%r15) # 8-byte Reload -; Z16-NEXT: ld %f11, 192(%r15) # 8-byte Reload -; Z16-NEXT: ld %f12, 184(%r15) # 8-byte Reload -; Z16-NEXT: ld %f13, 176(%r15) # 8-byte Reload -; Z16-NEXT: ld %f14, 168(%r15) # 8-byte Reload -; Z16-NEXT: ld %f15, 160(%r15) # 8-byte Reload -; 
Z16-NEXT: lmg %r13, %r15, 328(%r15) +; Z16-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v1, %v0 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhf %v0, %v0, %v1 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhg %v24, %v0, %v1 +; Z16-NEXT: lmg %r14, %r15, 336(%r15) ; Z16-NEXT: br %r14 %canonicalized = call <8 x half> @llvm.canonicalize.v8f16(<8 x half> %a) ret <8 x half> %canonicalized @@ -253,85 +259,93 @@ define void @canonicalize_ptr_v8f16(ptr %out) nounwind { ; Z16: # %bb.0: ; Z16-NEXT: stmg %r13, %r15, 104(%r15) ; Z16-NEXT: aghi %r15, -224 -; Z16-NEXT: std %f8, 216(%r15) # 8-byte Spill -; Z16-NEXT: std %f9, 208(%r15) # 8-byte Spill -; Z16-NEXT: std %f10, 200(%r15) # 8-byte Spill -; Z16-NEXT: std %f11, 192(%r15) # 8-byte Spill -; Z16-NEXT: std %f12, 184(%r15) # 8-byte Spill -; Z16-NEXT: std %f13, 176(%r15) # 8-byte Spill -; Z16-NEXT: std %f14, 168(%r15) # 8-byte Spill -; Z16-NEXT: std %f15, 160(%r15) # 8-byte Spill -; Z16-NEXT: vlreph %v0, 0(%r2) -; Z16-NEXT: vlreph %v8, 14(%r2) -; Z16-NEXT: vlreph %v9, 12(%r2) -; Z16-NEXT: vlreph %v10, 10(%r2) +; Z16-NEXT: vl %v0, 0(%r2), 3 ; Z16-NEXT: lgr %r13, %r2 -; Z16-NEXT: vlreph %v11, 8(%r2) -; Z16-NEXT: vlreph %v12, 6(%r2) -; Z16-NEXT: vlreph %v13, 4(%r2) -; Z16-NEXT: vlreph %v14, 2(%r2) +; Z16-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; Z16-NEXT: vreph %v0, %v0, 7 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f15, %f0 -; Z16-NEXT: ldr %f0, %f14 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 6 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f14, %f0 -; Z16-NEXT: ldr %f0, %f13 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 5 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f13, %f0 -; Z16-NEXT: ldr %f0, %f12 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 4 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f12, %f0 -; Z16-NEXT: ldr %f0, %f11 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhf %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 3 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, 
%f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f11, %f0 -; Z16-NEXT: ldr %f0, %f10 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 2 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f10, %f0 -; Z16-NEXT: ldr %f0, %f9 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: ldr %f9, %f0 -; Z16-NEXT: ldr %f0, %f8 +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill +; Z16-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; Z16-NEXT: vreph %v0, %v0, 1 +; Z16-NEXT: # kill: def $f0h killed $f0h killed $v0 ; Z16-NEXT: brasl %r14, __extendhfsf2@PLT ; Z16-NEXT: vgmf %v1, 2, 8 ; Z16-NEXT: meebr %f0, %f1 ; Z16-NEXT: brasl %r14, __truncsfhf2@PLT -; Z16-NEXT: vsteh %v9, 12(%r13), 0 -; Z16-NEXT: vsteh %v10, 10(%r13), 0 -; Z16-NEXT: vsteh %v11, 8(%r13), 0 -; Z16-NEXT: vsteh %v12, 6(%r13), 0 -; Z16-NEXT: vsteh %v13, 4(%r13), 0 -; Z16-NEXT: vsteh %v14, 2(%r13), 0 -; Z16-NEXT: vsteh %v15, 0(%r13), 0 -; Z16-NEXT: ld %f8, 216(%r15) # 8-byte Reload -; Z16-NEXT: ld %f9, 208(%r15) # 8-byte Reload -; Z16-NEXT: ld %f10, 200(%r15) # 8-byte Reload -; Z16-NEXT: ld %f11, 192(%r15) # 8-byte Reload -; Z16-NEXT: ld %f12, 184(%r15) # 8-byte Reload -; Z16-NEXT: ld %f13, 176(%r15) # 8-byte Reload -; Z16-NEXT: ld %f14, 168(%r15) # 8-byte Reload -; Z16-NEXT: ld %f15, 160(%r15) # 8-byte Reload -; Z16-NEXT: vsteh %v0, 14(%r13), 0 +; Z16-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload +; Z16-NEXT: # kill: def $f0h killed $f0h def $v0 +; Z16-NEXT: vmrhh %v0, %v1, %v0 +; Z16-NEXT: vl %v1, 176(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhf %v0, %v0, %v1 +; Z16-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; Z16-NEXT: vmrhg %v0, %v0, %v1 +; Z16-NEXT: vst %v0, 0(%r13), 3 ; Z16-NEXT: lmg %r13, %r15, 328(%r15) ; Z16-NEXT: br %r14 %val = load <8 x half>, ptr %out diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll new file mode 100644 index 0000000000000..825472299d028 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-binops.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test some fp16 vector operations, which must be scalarized. With less than +; 8 elements there should only be operations emitted for the used elements. 
+ +%Ty0 = type <8 x half> +define void @fun0(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun0: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -288 +; CHECK-NEXT: .cfi_def_cfa_offset 448 +; CHECK-NEXT: std %f8, 280(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 272(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 264(%r15) # 8-byte Spill +; CHECK-NEXT: std %f11, 256(%r15) # 8-byte Spill +; CHECK-NEXT: std %f12, 248(%r15) # 8-byte Spill +; CHECK-NEXT: std %f13, 240(%r15) # 8-byte Spill +; CHECK-NEXT: std %f14, 232(%r15) # 8-byte Spill +; CHECK-NEXT: std %f15, 224(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: .cfi_offset %f12, -200 +; CHECK-NEXT: .cfi_offset %f13, -208 +; CHECK-NEXT: .cfi_offset %f14, -216 +; CHECK-NEXT: .cfi_offset %f15, -224 +; CHECK-NEXT: lgh %r0, 14(%r2) +; CHECK-NEXT: lgr %r13, %r3 +; CHECK-NEXT: lgh %r1, 12(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: stg %r0, 216(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 10(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: stg %r1, 208(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r1, 8(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: stg %r0, 200(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 6(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: stg %r1, 192(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r1, 4(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: stg %r0, 176(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 2(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: stg %r1, 160(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r1, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f8, %r0 +; CHECK-NEXT: lgh %r0, 30(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f13, %r1 +; CHECK-NEXT: lgh %r1, 28(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: stg %r0, 184(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 26(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: stg %r1, 168(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r1, 24(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgh %r3, 22(%r2) +; CHECK-NEXT: ldgr %f10, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f11, %r0 +; CHECK-NEXT: sllg %r0, %r3, 48 +; CHECK-NEXT: lgh %r1, 20(%r2) +; CHECK-NEXT: ldgr %f12, %r0 +; CHECK-NEXT: lgh %r0, 18(%r2) +; CHECK-NEXT: lgh %r2, 16(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f14, %r1 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: sllg %r1, %r2, 48 +; CHECK-NEXT: ldgr %f0, %r1 +; CHECK-NEXT: ldgr %f15, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f13 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f9 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f13, %f0 +; CHECK-NEXT: ler %f0, %f15 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f9 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f14 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ld %f0, 160(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, 
__extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f9 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f12 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f12, %f0 +; CHECK-NEXT: ld %f0, 176(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f12 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f14, %f0 +; CHECK-NEXT: ler %f0, %f11 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f11, %f0 +; CHECK-NEXT: ld %f0, 192(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f11 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f11, %f0 +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ld %f0, 200(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f10 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ld %f0, 168(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f12, %f0 +; CHECK-NEXT: ld %f0, 208(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f12 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f12, %f0 +; CHECK-NEXT: ld %f0, 184(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f15, %f0 +; CHECK-NEXT: ld %f0, 216(%r15) # 8-byte Reload +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: aebr %f0, %f15 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r13) +; CHECK-NEXT: lgdr %r0, %f12 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r13) +; CHECK-NEXT: lgdr %r0, %f10 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r13) +; CHECK-NEXT: lgdr %r0, %f11 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 8(%r13) +; CHECK-NEXT: lgdr %r0, %f14 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r13) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r13) +; CHECK-NEXT: lgdr %r0, %f8 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r13) +; CHECK-NEXT: lgdr %r0, %f13 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT: ld %f8, 280(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 272(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f10, 264(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 256(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f12, 248(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f13, 240(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f14, 232(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f15, 224(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r13, %r15, 392(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -248 +; VECTOR-NEXT: 
.cfi_def_cfa_offset 408 +; VECTOR-NEXT: std %f8, 240(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: vl %v0, 16(%r2), 3 +; VECTOR-NEXT: mvc 160(16,%r15), 0(%r2) # 16-byte Folded Spill +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: vst %v0, 176(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vreph %v0, %v0, 7 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 7 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 6 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 6 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v0, %v1 +; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 5 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 5 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 4 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 4 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v0, %v1 +; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vmrhf %v0, %v0, %v1 +; VECTOR-NEXT: vst %v0, 208(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 3 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 3 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 2 
+; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v0, %v1 +; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vst %v0, 224(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 1 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 1 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: aebr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vl %v1, 224(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v1, %v0 +; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vmrhf %v0, %v0, %v1 +; VECTOR-NEXT: vl %v1, 208(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: ld %f8, 240(%r15) # 8-byte Reload +; VECTOR-NEXT: vmrhg %v0, %v0, %v1 +; VECTOR-NEXT: vst %v0, 0(%r13), 3 +; VECTOR-NEXT: lmg %r13, %r15, 352(%r15) +; VECTOR-NEXT: br %r14 + %LHS = load %Ty0, ptr %Src + %S2 = getelementptr %Ty0, ptr %Src, i32 1 + %RHS = load %Ty0, ptr %S2 + %Res = fadd %Ty0 %LHS, %RHS + store %Ty0 %Res, ptr %Dst + ret void +} + +%Ty1 = type <4 x half> +define void @fun1(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -224 +; CHECK-NEXT: .cfi_def_cfa_offset 384 +; CHECK-NEXT: std %f8, 216(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 208(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 200(%r15) # 8-byte Spill +; CHECK-NEXT: std %f11, 192(%r15) # 8-byte Spill +; CHECK-NEXT: std %f12, 184(%r15) # 8-byte Spill +; CHECK-NEXT: std %f13, 176(%r15) # 8-byte Spill +; CHECK-NEXT: std %f14, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f15, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: .cfi_offset %f12, -200 +; CHECK-NEXT: .cfi_offset %f13, -208 +; CHECK-NEXT: .cfi_offset %f14, -216 +; CHECK-NEXT: .cfi_offset %f15, -224 +; CHECK-NEXT: lgh %r0, 6(%r2) +; CHECK-NEXT: lgr %r13, %r3 +; CHECK-NEXT: lgh %r1, 4(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f8, %r0 +; CHECK-NEXT: lgh %r0, 2(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f9, %r1 +; CHECK-NEXT: lgh %r1, 
0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgh %r3, 14(%r2) +; CHECK-NEXT: ldgr %f12, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f10, %r0 +; CHECK-NEXT: sllg %r0, %r3, 48 +; CHECK-NEXT: lgh %r1, 12(%r2) +; CHECK-NEXT: ldgr %f11, %r0 +; CHECK-NEXT: lgh %r0, 10(%r2) +; CHECK-NEXT: lgh %r2, 8(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f13, %r1 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: sllg %r1, %r2, 48 +; CHECK-NEXT: ldgr %f0, %r1 +; CHECK-NEXT: ldgr %f14, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f15, %f0 +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: sebr %f0, %f15 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f14 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f14, %f0 +; CHECK-NEXT: ler %f0, %f12 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: sebr %f0, %f14 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f12, %f0 +; CHECK-NEXT: ler %f0, %f13 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f13, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: sebr %f0, %f13 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f11 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f11, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: sebr %f0, %f11 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r13) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r13) +; CHECK-NEXT: lgdr %r0, %f12 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r13) +; CHECK-NEXT: lgdr %r0, %f10 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT: ld %f8, 216(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 208(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f10, 200(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 192(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f12, 184(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f13, 176(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f14, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f15, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r13, %r15, 328(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -224 +; VECTOR-NEXT: .cfi_def_cfa_offset 384 +; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: .cfi_offset %f11, -192 +; VECTOR-NEXT: .cfi_offset %f12, -200 +; VECTOR-NEXT: .cfi_offset %f13, -208 +; VECTOR-NEXT: .cfi_offset %f14, -216 +; VECTOR-NEXT: .cfi_offset %f15, -224 +; VECTOR-NEXT: vlreph %v0, 8(%r2) +; 
VECTOR-NEXT: vlreph %v8, 6(%r2) +; VECTOR-NEXT: vlreph %v9, 4(%r2) +; VECTOR-NEXT: vlreph %v10, 2(%r2) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: vlreph %v11, 0(%r2) +; VECTOR-NEXT: vlreph %v12, 14(%r2) +; VECTOR-NEXT: vlreph %v13, 12(%r2) +; VECTOR-NEXT: vlreph %v14, 10(%r2) +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f15, %f0 +; VECTOR-NEXT: ldr %f0, %f11 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: sebr %f0, %f15 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f11, %f0 +; VECTOR-NEXT: ldr %f0, %f14 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f14, %f0 +; VECTOR-NEXT: ldr %f0, %f10 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: sebr %f0, %f14 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f13 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f13, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: sebr %f0, %f13 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f12 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f12, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: sebr %f0, %f12 +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vsteh %v9, 4(%r13), 0 +; VECTOR-NEXT: vsteh %v10, 2(%r13), 0 +; VECTOR-NEXT: vsteh %v11, 0(%r13), 0 +; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: vsteh %v0, 6(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 328(%r15) +; VECTOR-NEXT: br %r14 + %LHS = load %Ty1, ptr %Src + %S2 = getelementptr %Ty1, ptr %Src, i32 1 + %RHS = load %Ty1, ptr %S2 + %Res = fsub %Ty1 %LHS, %RHS + store %Ty1 %Res, ptr %Dst + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll new file mode 100644 index 0000000000000..fac8a64be28f6 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-conv.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test conversions between different-sized float elements. + +; Test cases where both elements of a v2f64 are converted to f16s. 
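+; Since fp16 has no native arithmetic or conversion instructions in this
+; configuration, each element is expected to be truncated through the
+; __truncdfhf2 libcall seen in the checks below. A minimal scalar sketch of
+; the same conversion (trunc_scalar is an illustrative name, not one of the
+; checked functions):
+;
+;   define half @trunc_scalar(double %d) {
+;     %h = fptrunc double %d to half  ; lowers to a call to __truncdfhf2
+;     ret half %h
+;   }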
+define void @f1(<2 x double> %val, ptr %ptr) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: lgr %r13, %r2 +; CHECK-NEXT: ldr %f8, %f2 +; CHECK-NEXT: brasl %r14, __truncdfhf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ldr %f0, %f8 +; CHECK-NEXT: brasl %r14, __truncdfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r13) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r13, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -184 +; VECTOR-NEXT: .cfi_def_cfa_offset 344 +; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: lgr %r13, %r2 +; VECTOR-NEXT: vst %v24, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vrepg %v0, %v24, 1 +; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: vsteh %v8, 2(%r13), 0 +; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 288(%r15) +; VECTOR-NEXT: br %r14 + %res = fptrunc <2 x double> %val to <2 x half> + store <2 x half> %res, ptr %ptr + ret void +} + +; Test conversion of an f64 in a vector register to an f16. +define half @f2(<2 x double> %vec) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __truncdfhf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f2: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: vlr %v0, %v24 +; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 +; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 + %scalar = extractelement <2 x double> %vec, i32 0 + %ret = fptrunc double %scalar to half + ret half %ret +} + +; Test cases where even elements of a v4f16 are converted to f64s. 
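+; The widening direction likewise goes through a libcall (__extendhfdf2 in
+; the checks below); per element the lowering is equivalent to this sketch,
+; with the two results then merged back into one vector register (vmrhg).
+; The names %elt and %ext are illustrative only:
+;
+;   %elt = extractelement <4 x half> %vec, i32 0
+;   %ext = fpext half %elt to double  ; lowers to a call to __extendhfdf2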
+define <2 x double> @f3(<4 x half> %vec) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -176 +; CHECK-NEXT: .cfi_def_cfa_offset 336 +; CHECK-NEXT: std %f8, 168(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: ler %f8, %f4 +; CHECK-NEXT: brasl %r14, __extendhfdf2@PLT +; CHECK-NEXT: ldr %f9, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfdf2@PLT +; CHECK-NEXT: ldr %f2, %f0 +; CHECK-NEXT: ldr %f0, %f9 +; CHECK-NEXT: ld %f8, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 160(%r15) # 8-byte Reload +; CHECK-NEXT: lmg %r14, %r15, 288(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f3: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -192 +; VECTOR-NEXT: .cfi_def_cfa_offset 352 +; VECTOR-NEXT: vreph %v1, %v24, 2 +; VECTOR-NEXT: vlr %v0, %v24 +; VECTOR-NEXT: vst %v1, 176(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT +; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 +; VECTOR-NEXT: vmrhg %v24, %v1, %v0 +; VECTOR-NEXT: lmg %r14, %r15, 304(%r15) +; VECTOR-NEXT: br %r14 + %shuffle = shufflevector <4 x half> %vec, <4 x half> %vec, <2 x i32> <i32 0, i32 2> + %res = fpext <2 x half> %shuffle to <2 x double> + ret <2 x double> %res +} + +; Test conversion of an f16 in a vector register to an f32, constant element index. +define float @f4(<4 x half> %vec) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f4: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -160 +; VECTOR-NEXT: .cfi_def_cfa_offset 320 +; VECTOR-NEXT: vlr %v0, %v24 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: lmg %r14, %r15, 272(%r15) +; VECTOR-NEXT: br %r14 + %scalar = extractelement <4 x half> %vec, i32 0 + %ret = fpext half %scalar to float + ret float %ret +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll new file mode 100644 index 0000000000000..0500f43b7f33e --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-fcmp-select.ll @@ -0,0 +1,503 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test fcmp and select with fp16 vectors. + +; Use of vsel with full vector. 
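+; Each lane is widened to float via __extendhfsf2 and compared with cebr;
+; the per-lane i1 results are materialized with lochie and inserted into a
+; mask vector (vlvgp/vlvgh) that finally feeds a single vsel. Per lane this
+; is equivalent to the following sketch (%xa, %xb and %c are illustrative
+; names, not taken from the checks):
+;
+;   %xa = fpext half %a to float  ; __extendhfsf2
+;   %xb = fpext half %b to float  ; __extendhfsf2
+;   %c = fcmp oeq float %xa, %xb  ; cebr + lochie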
+%Ty0 = type <8 x half> +define void @fun0(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun0: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r6, %r15, 48(%r15) +; CHECK-NEXT: .cfi_offset %r6, -112 +; CHECK-NEXT: .cfi_offset %r7, -104 +; CHECK-NEXT: .cfi_offset %r8, -96 +; CHECK-NEXT: .cfi_offset %r9, -88 +; CHECK-NEXT: .cfi_offset %r10, -80 +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r12, -64 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -272 +; CHECK-NEXT: .cfi_def_cfa_offset 432 +; CHECK-NEXT: std %f8, 264(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 256(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 248(%r15) # 8-byte Spill +; CHECK-NEXT: std %f11, 240(%r15) # 8-byte Spill +; CHECK-NEXT: std %f12, 232(%r15) # 8-byte Spill +; CHECK-NEXT: std %f13, 224(%r15) # 8-byte Spill +; CHECK-NEXT: std %f14, 216(%r15) # 8-byte Spill +; CHECK-NEXT: std %f15, 208(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: .cfi_offset %f12, -200 +; CHECK-NEXT: .cfi_offset %f13, -208 +; CHECK-NEXT: .cfi_offset %f14, -216 +; CHECK-NEXT: .cfi_offset %f15, -224 +; CHECK-NEXT: lgh %r0, 14(%r2) +; CHECK-NEXT: stg %r0, 200(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 12(%r2) +; CHECK-NEXT: stg %r0, 160(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 6(%r2) +; CHECK-NEXT: sllg %r12, %r0, 48 +; CHECK-NEXT: lgh %r0, 4(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f10, %r0 +; CHECK-NEXT: lgh %r0, 2(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f9, %r0 +; CHECK-NEXT: lgh %r0, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f12, %r0 +; CHECK-NEXT: lgh %r0, 30(%r2) +; CHECK-NEXT: stg %r0, 192(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 28(%r2) +; CHECK-NEXT: stg %r0, 184(%r15) # 8-byte Spill +; CHECK-NEXT: lgh %r0, 22(%r2) +; CHECK-NEXT: sllg %r10, %r0, 48 +; CHECK-NEXT: lgh %r0, 20(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f13, %r0 +; CHECK-NEXT: lgh %r0, 18(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f8, %r0 +; CHECK-NEXT: lgh %r0, 16(%r2) +; CHECK-NEXT: lgh %r8, 10(%r2) +; CHECK-NEXT: lgh %r6, 8(%r2) +; CHECK-NEXT: lgh %r7, 26(%r2) +; CHECK-NEXT: lgh %r11, 24(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgr %r13, %r3 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f11, %f0 +; CHECK-NEXT: ler %f0, %f12 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f11 +; CHECK-NEXT: je .LBB0_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ler %f0, %f11 +; CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: sllg %r6, %r6, 48 +; CHECK-NEXT: sllg %r9, %r11, 48 +; CHECK-NEXT: ldgr %f11, %r12 +; CHECK-NEXT: ldgr %f15, %r10 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: std %f0, 176(%r15) # 8-byte Spill +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f8 +; CHECK-NEXT: je .LBB0_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: .LBB0_4: +; CHECK-NEXT: sllg %r11, %r8, 48 +; CHECK-NEXT: sllg %r8, %r7, 48 +; CHECK-NEXT: ldgr %f12, %r6 +; CHECK-NEXT: ldgr %f14, %r9 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; 
CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: std %f0, 168(%r15) # 8-byte Spill +; CHECK-NEXT: ler %f0, %f13 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f8 +; CHECK-NEXT: je .LBB0_6 +; CHECK-NEXT: # %bb.5: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: .LBB0_6: +; CHECK-NEXT: lg %r0, 160(%r15) # 8-byte Reload +; CHECK-NEXT: sllg %r12, %r0, 48 +; CHECK-NEXT: lg %r0, 184(%r15) # 8-byte Reload +; CHECK-NEXT: sllg %r10, %r0, 48 +; CHECK-NEXT: ldgr %f13, %r11 +; CHECK-NEXT: ldgr %f8, %r8 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: std %f0, 160(%r15) # 8-byte Spill +; CHECK-NEXT: ler %f0, %f15 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f11 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f9 +; CHECK-NEXT: je .LBB0_8 +; CHECK-NEXT: # %bb.7: +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: .LBB0_8: +; CHECK-NEXT: lg %r0, 200(%r15) # 8-byte Reload +; CHECK-NEXT: sllg %r11, %r0, 48 +; CHECK-NEXT: lg %r0, 192(%r15) # 8-byte Reload +; CHECK-NEXT: sllg %r9, %r0, 48 +; CHECK-NEXT: ldgr %f15, %r12 +; CHECK-NEXT: ldgr %f9, %r10 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f11, %f0 +; CHECK-NEXT: ler %f0, %f14 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f12 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f10 +; CHECK-NEXT: je .LBB0_10 +; CHECK-NEXT: # %bb.9: +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: .LBB0_10: +; CHECK-NEXT: ldgr %f14, %r11 +; CHECK-NEXT: ldgr %f10, %r9 +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f12, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f13 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f8 +; CHECK-NEXT: je .LBB0_12 +; CHECK-NEXT: # %bb.11: +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: .LBB0_12: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f8, %f0 +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f15 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f9 +; CHECK-NEXT: je .LBB0_14 +; CHECK-NEXT: # %bb.13: +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: .LBB0_14: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f14 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f10 +; CHECK-NEXT: je .LBB0_16 +; CHECK-NEXT: # %bb.15: +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: .LBB0_16: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r13) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r13) +; CHECK-NEXT: lgdr %r0, %f8 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r13) +; CHECK-NEXT: lgdr %r0, %f12 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 8(%r13) +; CHECK-NEXT: lgdr %r0, %f11 +; CHECK-NEXT: ld %f8, 264(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 256(%r15) # 8-byte Reload +; CHECK-NEXT: srlg %r0, %r0, 48 +; 
CHECK-NEXT: ld %f10, 248(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 240(%r15) # 8-byte Reload +; CHECK-NEXT: sth %r0, 6(%r13) +; CHECK-NEXT: lg %r0, 160(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f12, 232(%r15) # 8-byte Reload +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: ld %f13, 224(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f14, 216(%r15) # 8-byte Reload +; CHECK-NEXT: sth %r0, 4(%r13) +; CHECK-NEXT: lg %r0, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f15, 208(%r15) # 8-byte Reload +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r13) +; CHECK-NEXT: lg %r0, 176(%r15) # 8-byte Reload +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT: lmg %r6, %r15, 320(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r11, %r15, 88(%r15) +; VECTOR-NEXT: .cfi_offset %r11, -72 +; VECTOR-NEXT: .cfi_offset %r12, -64 +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -216 +; VECTOR-NEXT: .cfi_def_cfa_offset 376 +; VECTOR-NEXT: std %f8, 208(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: vl %v0, 16(%r2), 3 +; VECTOR-NEXT: mvc 176(16,%r15), 0(%r2) # 16-byte Folded Spill +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: vst %v0, 192(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vreph %v0, %v0, 7 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 7 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r11, 0 +; VECTOR-NEXT: lhi %r12, 0 +; VECTOR-NEXT: lochie %r11, -1 +; VECTOR-NEXT: vreph %v0, %v0, 3 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 3 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgp %v0, %r0, %r11 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgh %v0, %r0, 0 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 1 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 1 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: 
vlvgh %v0, %r0, 1 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgh %v0, %r0, 2 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 4 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 4 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgh %v0, %r0, 4 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 5 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 5 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lhi %r0, 0 +; VECTOR-NEXT: lochie %r0, -1 +; VECTOR-NEXT: vlvgh %v0, %r0, 5 +; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill +; VECTOR-NEXT: vl %v0, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 6 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f8, %f0 +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vreph %v0, %v0, 6 +; VECTOR-NEXT: # kill: def $f0h killed $f0h killed $v0 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f8 +; VECTOR-NEXT: vl %v2, 160(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vl %v0, 176(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: vl %v1, 192(%r15), 3 # 16-byte Reload +; VECTOR-NEXT: lochie %r12, -1 +; VECTOR-NEXT: vlvgh %v2, %r12, 6 +; VECTOR-NEXT: ld %f8, 208(%r15) # 8-byte Reload +; VECTOR-NEXT: vsel %v0, %v0, %v1, %v2 +; VECTOR-NEXT: vst %v0, 0(%r13), 3 +; VECTOR-NEXT: lmg %r11, %r15, 304(%r15) +; VECTOR-NEXT: br %r14 + %A = load %Ty0, ptr %Src + %S2 = getelementptr %Ty0, ptr %Src, i32 1 + %B = load %Ty0, ptr %S2 + %C = fcmp oeq %Ty0 %A, %B + %S = select <8 x i1> %C, %Ty0 %A, %Ty0 %B + store %Ty0 %S, ptr %Dst + ret void +} + +%Ty1 = type <2 x half> +define void @fun1(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun1: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: std %f8, 184(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 176(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 168(%r15) # 8-byte Spill +; CHECK-NEXT: 
std %f11, 160(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: lgh %r0, 2(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f8, %r0 +; CHECK-NEXT: lgh %r0, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f11, %r0 +; CHECK-NEXT: lgh %r0, 6(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f10, %r0 +; CHECK-NEXT: lgh %r0, 4(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgr %r13, %r3 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: # kill: def $f0h killed $f0h killed $f0d +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f11 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f9 +; CHECK-NEXT: je .LBB1_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ler %f0, %f9 +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: ler %f9, %f0 +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: ler %f10, %f0 +; CHECK-NEXT: ler %f0, %f8 +; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT +; CHECK-NEXT: cebr %f0, %f10 +; CHECK-NEXT: je .LBB1_4 +; CHECK-NEXT: # %bb.3: +; CHECK-NEXT: ler %f0, %f10 +; CHECK-NEXT: .LBB1_4: +; CHECK-NEXT: brasl %r14, __truncsfhf2@PLT +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r13) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: ld %f8, 184(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 176(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f10, 168(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 160(%r15) # 8-byte Reload +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r13) +; CHECK-NEXT: lmg %r13, %r15, 296(%r15) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) +; VECTOR-NEXT: .cfi_offset %r13, -56 +; VECTOR-NEXT: .cfi_offset %r14, -48 +; VECTOR-NEXT: .cfi_offset %r15, -40 +; VECTOR-NEXT: aghi %r15, -192 +; VECTOR-NEXT: .cfi_def_cfa_offset 352 +; VECTOR-NEXT: std %f8, 184(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f9, 176(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Spill +; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Spill +; VECTOR-NEXT: .cfi_offset %f8, -168 +; VECTOR-NEXT: .cfi_offset %f9, -176 +; VECTOR-NEXT: .cfi_offset %f10, -184 +; VECTOR-NEXT: .cfi_offset %f11, -192 +; VECTOR-NEXT: vlreph %v0, 4(%r2) +; VECTOR-NEXT: vlreph %v8, 2(%r2) +; VECTOR-NEXT: vlreph %v11, 0(%r2) +; VECTOR-NEXT: vlreph %v9, 6(%r2) +; VECTOR-NEXT: lgr %r13, %r3 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f11 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f10 +; VECTOR-NEXT: je .LBB1_2 +; VECTOR-NEXT: # %bb.1: +; VECTOR-NEXT: ldr %f0, %f10 +; VECTOR-NEXT: .LBB1_2: +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: ldr %f10, %f0 +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: ldr %f9, %f0 +; VECTOR-NEXT: ldr %f0, %f8 +; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT +; VECTOR-NEXT: cebr %f0, %f9 +; VECTOR-NEXT: je .LBB1_4 +; VECTOR-NEXT: # %bb.3: +; VECTOR-NEXT: ldr %f0, %f9 +; VECTOR-NEXT: .LBB1_4: +; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT +; VECTOR-NEXT: vsteh %v10, 0(%r13), 0 +; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Reload +; VECTOR-NEXT: ld %f10, 168(%r15) # 
8-byte Reload +; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Reload +; VECTOR-NEXT: vsteh %v0, 2(%r13), 0 +; VECTOR-NEXT: lmg %r13, %r15, 296(%r15) +; VECTOR-NEXT: br %r14 + %A = load %Ty1, ptr %Src + %S2 = getelementptr %Ty1, ptr %Src, i32 1 + %B = load %Ty1, ptr %S2 + %C = fcmp oeq %Ty1 %A, %B + %S = select <2 x i1> %C, %Ty1 %A, %Ty1 %B + store %Ty1 %S, ptr %Dst + ret void +} + diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll new file mode 100644 index 0000000000000..2c8a69ec1e2c9 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-mem.ll @@ -0,0 +1,126 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test loading and storing fp16 vectors. + +define void @fun0(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun0: +; CHECK: # %bb.0: +; CHECK-NEXT: lgh %r0, 0(%r2) +; CHECK-NEXT: lgh %r1, 2(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgh %r0, 4(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f1, %r1 +; CHECK-NEXT: lgh %r1, 6(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f2, %r0 +; CHECK-NEXT: lgh %r0, 8(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f3, %r1 +; CHECK-NEXT: lgh %r1, 10(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f4, %r0 +; CHECK-NEXT: lgh %r0, 12(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: lgh %r2, 14(%r2) +; CHECK-NEXT: ldgr %f5, %r1 +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f6, %r0 +; CHECK-NEXT: sllg %r0, %r2, 48 +; CHECK-NEXT: ldgr %f7, %r0 +; CHECK-NEXT: lgdr %r0, %f7 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r3) +; CHECK-NEXT: lgdr %r0, %f6 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r3) +; CHECK-NEXT: lgdr %r0, %f5 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r3) +; CHECK-NEXT: lgdr %r0, %f4 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 8(%r3) +; CHECK-NEXT: lgdr %r0, %f3 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r3) +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r3) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r3) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r3) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun0: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vl %v0, 0(%r2), 3 +; VECTOR-NEXT: vst %v0, 0(%r3), 3 +; VECTOR-NEXT: br %r14 + %L = load <8 x half>, ptr %Src + store <8 x half> %L, ptr %Dst + ret void +} + +define void @fun1(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun1: +; CHECK: # %bb.0: +; CHECK-NEXT: lgh %r0, 4(%r2) +; CHECK-NEXT: lgh %r1, 6(%r2) +; CHECK-NEXT: l %r2, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f1, %r0 +; CHECK-NEXT: st %r2, 0(%r3) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r3) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r3) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: l %r0, 0(%r2) +; VECTOR-NEXT: vlreph %v0, 4(%r2) +; VECTOR-NEXT: vlreph %v1, 6(%r2) +; VECTOR-NEXT: vsteh %v1, 6(%r3), 0 +; VECTOR-NEXT: vsteh %v0, 4(%r3), 0 +; VECTOR-NEXT: st %r0, 0(%r3) +; VECTOR-NEXT: br %r14 
+ %L = load <4 x half>, ptr %Src + store <4 x half> %L, ptr %Dst + ret void +} + +define void @fun2(ptr %Src, ptr %Dst) { +; CHECK-LABEL: fun2: +; CHECK: # %bb.0: +; CHECK-NEXT: lg %r0, 24(%r2) +; CHECK-NEXT: lg %r1, 16(%r2) +; CHECK-NEXT: lg %r4, 8(%r2) +; CHECK-NEXT: lg %r2, 0(%r2) +; CHECK-NEXT: stg %r0, 24(%r3) +; CHECK-NEXT: stg %r1, 16(%r3) +; CHECK-NEXT: stg %r4, 8(%r3) +; CHECK-NEXT: stg %r2, 0(%r3) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: fun2: +; VECTOR: # %bb.0: +; VECTOR-NEXT: vl %v0, 16(%r2), 4 +; VECTOR-NEXT: vl %v1, 0(%r2), 4 +; VECTOR-NEXT: vst %v1, 0(%r3), 4 +; VECTOR-NEXT: vst %v0, 16(%r3), 4 +; VECTOR-NEXT: br %r14 + %L = load <16 x half>, ptr %Src + store <16 x half> %L, ptr %Dst + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll new file mode 100644 index 0000000000000..e1daffecc374c --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fp-half-vector-move.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s --check-prefix=VECTOR +; +; Test insertions into fp16 vectors. + +define <8 x half> @f0(half %val) { +; CHECK-LABEL: f0: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r2) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f0: +; VECTOR: # %bb.0: +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vreph %v24, %v0, 0 +; VECTOR-NEXT: br %r14 + %ret = insertelement <8 x half> poison, half %val, i32 2 + ret <8 x half> %ret +} + +define <8 x half> @f1(half %val) { +; CHECK-LABEL: f1: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r2) +; CHECK-NEXT: sth %r0, 4(%r2) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f1: +; VECTOR: # %bb.0: +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vreph %v24, %v0, 0 +; VECTOR-NEXT: br %r14 + %v0 = insertelement <8 x half> poison, half %val, i32 2 + %ret = insertelement <8 x half> %v0, half %val, i32 3 + ret <8 x half> %ret +} + +define <8 x half> @f2(half %val0, half %val1) { +; CHECK-LABEL: f2: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r2) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r2) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f2: +; VECTOR: # %bb.0: +; VECTOR-NEXT: # kill: def $f2h killed $f2h def $v2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v0, %v2 +; VECTOR-NEXT: vmrhf %v0, %v0, %v0 +; VECTOR-NEXT: vmrhg %v24, %v0, %v0 +; VECTOR-NEXT: br %r14 + %v0 = insertelement <8 x half> poison, half %val0, i32 2 + %ret = insertelement <8 x half> %v0, half %val1, i32 3 + ret <8 x half> %ret +} + +define <8 x half> @f3(half %val0, half %val1) { +; CHECK-LABEL: f3: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d +; CHECK-NEXT: # kill: def $f0h killed $f0h def $f0d +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r2) +; CHECK-NEXT: lgdr %r1, %f0 +; CHECK-NEXT: srlg %r1, %r1, 48 +; CHECK-NEXT: sth %r1, 8(%r2) +; CHECK-NEXT: 
sth %r0, 6(%r2) +; CHECK-NEXT: sth %r1, 4(%r2) +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f3: +; VECTOR: # %bb.0: +; VECTOR-NEXT: # kill: def $f2h killed $f2h def $v2 +; VECTOR-NEXT: # kill: def $f0h killed $f0h def $v0 +; VECTOR-NEXT: vmrhh %v0, %v0, %v2 +; VECTOR-NEXT: vmrhf %v0, %v0, %v0 +; VECTOR-NEXT: vmrhg %v24, %v0, %v0 +; VECTOR-NEXT: br %r14 + %v0 = insertelement <8 x half> poison, half %val0, i32 2 + %v1 = insertelement <8 x half> %v0, half %val1, i32 3 + %v2 = insertelement <8 x half> %v1, half %val0, i32 4 + %ret = insertelement <8 x half> %v2, half %val1, i32 5 + ret <8 x half> %ret +} + +; Test creation of vregs where the arg gets one VR128 which is split into two +; VR16 parts. +define <2 x half> @f4(<2 x half> %0) { +; CHECK-LABEL: f4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lzer %f0 +; CHECK-NEXT: br %r14 +; +; VECTOR-LABEL: f4: +; VECTOR: # %bb.0: # %entry +; VECTOR-NEXT: vreph %v0, %v24, 1 +; VECTOR-NEXT: vuplhh %v0, %v0 +; VECTOR-NEXT: vmrhf %v0, %v0, %v0 +; VECTOR-NEXT: vmrhg %v24, %v0, %v0 +; VECTOR-NEXT: br %r14 +entry: + br label %body + +body: ; preds = %entry + %1 = insertelement <2 x half> %0, half 0xH0000, i64 0 + ret <2 x half> %1 +} diff --git a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll b/llvm/test/CodeGen/SystemZ/fp-half-vector.ll deleted file mode 100644 index 4997c5b0c617d..0000000000000 --- a/llvm/test/CodeGen/SystemZ/fp-half-vector.ll +++ /dev/null @@ -1,725 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs \ -; RUN: | FileCheck %s --check-prefix=NOVEC -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 -verify-machineinstrs \ -; RUN: | FileCheck %s --check-prefix=VECTOR - -; Add the <8 x half> argument with itself and return it. 
-define <8 x half> @fun0(<8 x half> %Op) { -; NOVEC-LABEL: fun0: -; NOVEC: # %bb.0: # %entry -; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) -; NOVEC-NEXT: .cfi_offset %r13, -56 -; NOVEC-NEXT: .cfi_offset %r14, -48 -; NOVEC-NEXT: .cfi_offset %r15, -40 -; NOVEC-NEXT: aghi %r15, -224 -; NOVEC-NEXT: .cfi_def_cfa_offset 384 -; NOVEC-NEXT: std %f8, 216(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f9, 208(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f10, 200(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f11, 192(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f12, 184(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f13, 176(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f14, 168(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f15, 160(%r15) # 8-byte Spill -; NOVEC-NEXT: .cfi_offset %f8, -168 -; NOVEC-NEXT: .cfi_offset %f9, -176 -; NOVEC-NEXT: .cfi_offset %f10, -184 -; NOVEC-NEXT: .cfi_offset %f11, -192 -; NOVEC-NEXT: .cfi_offset %f12, -200 -; NOVEC-NEXT: .cfi_offset %f13, -208 -; NOVEC-NEXT: .cfi_offset %f14, -216 -; NOVEC-NEXT: .cfi_offset %f15, -224 -; NOVEC-NEXT: lgh %r0, 414(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f14, %r0 -; NOVEC-NEXT: lgh %r0, 406(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f12, %r0 -; NOVEC-NEXT: lgh %r0, 398(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f9, %r0 -; NOVEC-NEXT: lgh %r0, 390(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ler %f10, %f6 -; NOVEC-NEXT: ler %f11, %f4 -; NOVEC-NEXT: ler %f13, %f2 -; NOVEC-NEXT: ler %f15, %f0 -; NOVEC-NEXT: lgr %r13, %r2 -; NOVEC-NEXT: ldgr %f0, %r0 -; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f8, %f0 -; NOVEC-NEXT: ler %f0, %f9 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f9, %f0 -; NOVEC-NEXT: ler %f0, %f12 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f12, %f0 -; NOVEC-NEXT: ler %f0, %f14 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f14, %f0 -; NOVEC-NEXT: ler %f0, %f15 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f15, %f0 -; NOVEC-NEXT: ler %f0, %f13 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f13, %f0 -; NOVEC-NEXT: ler %f0, %f11 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f11, %f0 -; NOVEC-NEXT: ler %f0, %f10 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: # kill: def $f0h killed $f0h def $f0d -; NOVEC-NEXT: lgdr %r0, %f0 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 6(%r13) -; NOVEC-NEXT: lgdr %r0, %f11 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 4(%r13) -; NOVEC-NEXT: lgdr %r0, %f13 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 2(%r13) -; NOVEC-NEXT: lgdr %r0, %f15 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 0(%r13) -; NOVEC-NEXT: lgdr %r0, %f14 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 14(%r13) -; NOVEC-NEXT: lgdr %r0, %f12 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; 
NOVEC-NEXT: sth %r0, 12(%r13) -; NOVEC-NEXT: lgdr %r0, %f9 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 10(%r13) -; NOVEC-NEXT: lgdr %r0, %f8 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 8(%r13) -; NOVEC-NEXT: ld %f8, 216(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f9, 208(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f10, 200(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f11, 192(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f12, 184(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f13, 176(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f14, 168(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f15, 160(%r15) # 8-byte Reload -; NOVEC-NEXT: lmg %r13, %r15, 328(%r15) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun0: -; VECTOR: # %bb.0: # %entry -; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) -; VECTOR-NEXT: .cfi_offset %r13, -56 -; VECTOR-NEXT: .cfi_offset %r14, -48 -; VECTOR-NEXT: .cfi_offset %r15, -40 -; VECTOR-NEXT: aghi %r15, -224 -; VECTOR-NEXT: .cfi_def_cfa_offset 384 -; VECTOR-NEXT: std %f8, 216(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f9, 208(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f10, 200(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f11, 192(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f12, 184(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f13, 176(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f14, 168(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f15, 160(%r15) # 8-byte Spill -; VECTOR-NEXT: .cfi_offset %f8, -168 -; VECTOR-NEXT: .cfi_offset %f9, -176 -; VECTOR-NEXT: .cfi_offset %f10, -184 -; VECTOR-NEXT: .cfi_offset %f11, -192 -; VECTOR-NEXT: .cfi_offset %f12, -200 -; VECTOR-NEXT: .cfi_offset %f13, -208 -; VECTOR-NEXT: .cfi_offset %f14, -216 -; VECTOR-NEXT: .cfi_offset %f15, -224 -; VECTOR-NEXT: vlreph %v11, 414(%r15) -; VECTOR-NEXT: vlreph %v12, 406(%r15) -; VECTOR-NEXT: vlreph %v13, 398(%r15) -; VECTOR-NEXT: vlreph %v14, 390(%r15) -; VECTOR-NEXT: ldr %f8, %f6 -; VECTOR-NEXT: ldr %f9, %f4 -; VECTOR-NEXT: ldr %f10, %f2 -; VECTOR-NEXT: lgr %r13, %r2 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f15, %f0 -; VECTOR-NEXT: ldr %f0, %f10 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f10, %f0 -; VECTOR-NEXT: ldr %f0, %f9 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f9, %f0 -; VECTOR-NEXT: ldr %f0, %f8 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f8, %f0 -; VECTOR-NEXT: ldr %f0, %f14 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f14, %f0 -; VECTOR-NEXT: ldr %f0, %f13 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f13, %f0 -; VECTOR-NEXT: ldr %f0, %f12 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f12, %f0 -; VECTOR-NEXT: ldr %f0, %f11 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: vsteh %v0, 14(%r13), 0 -; VECTOR-NEXT: vsteh %v12, 12(%r13), 0 -; VECTOR-NEXT: vsteh %v13, 10(%r13), 0 -; VECTOR-NEXT: vsteh %v14, 8(%r13), 0 -; VECTOR-NEXT: vsteh %v8, 6(%r13), 0 -; VECTOR-NEXT: 
vsteh %v9, 4(%r13), 0 -; VECTOR-NEXT: vsteh %v10, 2(%r13), 0 -; VECTOR-NEXT: vsteh %v15, 0(%r13), 0 -; VECTOR-NEXT: ld %f8, 216(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f9, 208(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f10, 200(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f11, 192(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f12, 184(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f13, 176(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f14, 168(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f15, 160(%r15) # 8-byte Reload -; VECTOR-NEXT: lmg %r13, %r15, 328(%r15) -; VECTOR-NEXT: br %r14 -entry: - %Res = fadd <8 x half> %Op, %Op - ret <8 x half> %Res -} - -; Same, but with partial vector values. -define <4 x half> @fun1(<4 x half> %Op) { -; NOVEC-LABEL: fun1: -; NOVEC: # %bb.0: # %entry -; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) -; NOVEC-NEXT: .cfi_offset %r14, -48 -; NOVEC-NEXT: .cfi_offset %r15, -40 -; NOVEC-NEXT: aghi %r15, -192 -; NOVEC-NEXT: .cfi_def_cfa_offset 352 -; NOVEC-NEXT: std %f8, 184(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f9, 176(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f10, 168(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f11, 160(%r15) # 8-byte Spill -; NOVEC-NEXT: .cfi_offset %f8, -168 -; NOVEC-NEXT: .cfi_offset %f9, -176 -; NOVEC-NEXT: .cfi_offset %f10, -184 -; NOVEC-NEXT: .cfi_offset %f11, -192 -; NOVEC-NEXT: ler %f8, %f6 -; NOVEC-NEXT: ler %f9, %f4 -; NOVEC-NEXT: ler %f10, %f2 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f11, %f0 -; NOVEC-NEXT: ler %f0, %f10 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f10, %f0 -; NOVEC-NEXT: ler %f0, %f9 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f9, %f0 -; NOVEC-NEXT: ler %f0, %f8 -; NOVEC-NEXT: brasl %r14, __extendhfsf2@PLT -; NOVEC-NEXT: aebr %f0, %f0 -; NOVEC-NEXT: brasl %r14, __truncsfhf2@PLT -; NOVEC-NEXT: ler %f6, %f0 -; NOVEC-NEXT: ler %f0, %f11 -; NOVEC-NEXT: ler %f2, %f10 -; NOVEC-NEXT: ler %f4, %f9 -; NOVEC-NEXT: ld %f8, 184(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f9, 176(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f10, 168(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f11, 160(%r15) # 8-byte Reload -; NOVEC-NEXT: lmg %r14, %r15, 304(%r15) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun1: -; VECTOR: # %bb.0: # %entry -; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) -; VECTOR-NEXT: .cfi_offset %r14, -48 -; VECTOR-NEXT: .cfi_offset %r15, -40 -; VECTOR-NEXT: aghi %r15, -192 -; VECTOR-NEXT: .cfi_def_cfa_offset 352 -; VECTOR-NEXT: std %f8, 184(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f9, 176(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f10, 168(%r15) # 8-byte Spill -; VECTOR-NEXT: std %f11, 160(%r15) # 8-byte Spill -; VECTOR-NEXT: .cfi_offset %f8, -168 -; VECTOR-NEXT: .cfi_offset %f9, -176 -; VECTOR-NEXT: .cfi_offset %f10, -184 -; VECTOR-NEXT: .cfi_offset %f11, -192 -; VECTOR-NEXT: ldr %f8, %f6 -; VECTOR-NEXT: ldr %f9, %f4 -; VECTOR-NEXT: ldr %f10, %f2 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f11, %f0 -; VECTOR-NEXT: ldr %f0, %f10 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f10, %f0 -; VECTOR-NEXT: ldr %f0, %f9 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; 
VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f9, %f0 -; VECTOR-NEXT: ldr %f0, %f8 -; VECTOR-NEXT: brasl %r14, __extendhfsf2@PLT -; VECTOR-NEXT: aebr %f0, %f0 -; VECTOR-NEXT: brasl %r14, __truncsfhf2@PLT -; VECTOR-NEXT: ldr %f6, %f0 -; VECTOR-NEXT: ldr %f0, %f11 -; VECTOR-NEXT: ldr %f2, %f10 -; VECTOR-NEXT: ldr %f4, %f9 -; VECTOR-NEXT: ld %f8, 184(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f9, 176(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f10, 168(%r15) # 8-byte Reload -; VECTOR-NEXT: ld %f11, 160(%r15) # 8-byte Reload -; VECTOR-NEXT: lmg %r14, %r15, 304(%r15) -; VECTOR-NEXT: br %r14 -entry: - %Res = fadd <4 x half> %Op, %Op - ret <4 x half> %Res -} - -; Test a vector extension. -define <2 x half> @fun2(<2 x half> %Op) { -; NOVEC-LABEL: fun2: -; NOVEC: # %bb.0: # %entry -; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) -; NOVEC-NEXT: .cfi_offset %r14, -48 -; NOVEC-NEXT: .cfi_offset %r15, -40 -; NOVEC-NEXT: aghi %r15, -176 -; NOVEC-NEXT: .cfi_def_cfa_offset 336 -; NOVEC-NEXT: std %f8, 168(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f9, 160(%r15) # 8-byte Spill -; NOVEC-NEXT: .cfi_offset %f8, -168 -; NOVEC-NEXT: .cfi_offset %f9, -176 -; NOVEC-NEXT: ler %f8, %f2 -; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT -; NOVEC-NEXT: ldr %f9, %f0 -; NOVEC-NEXT: ler %f0, %f8 -; NOVEC-NEXT: brasl %r14, __extendhfdf2@PLT -; NOVEC-NEXT: adbr %f9, %f9 -; NOVEC-NEXT: ldr %f8, %f0 -; NOVEC-NEXT: adbr %f8, %f0 -; NOVEC-NEXT: ldr %f0, %f9 -; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT -; NOVEC-NEXT: ler %f9, %f0 -; NOVEC-NEXT: ldr %f0, %f8 -; NOVEC-NEXT: brasl %r14, __truncdfhf2@PLT -; NOVEC-NEXT: ler %f2, %f0 -; NOVEC-NEXT: ler %f0, %f9 -; NOVEC-NEXT: ld %f8, 168(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f9, 160(%r15) # 8-byte Reload -; NOVEC-NEXT: lmg %r14, %r15, 288(%r15) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun2: -; VECTOR: # %bb.0: # %entry -; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) -; VECTOR-NEXT: .cfi_offset %r14, -48 -; VECTOR-NEXT: .cfi_offset %r15, -40 -; VECTOR-NEXT: aghi %r15, -184 -; VECTOR-NEXT: .cfi_def_cfa_offset 344 -; VECTOR-NEXT: std %f8, 176(%r15) # 8-byte Spill -; VECTOR-NEXT: .cfi_offset %f8, -168 -; VECTOR-NEXT: ldr %f8, %f0 -; VECTOR-NEXT: ldr %f0, %f2 -; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT -; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 -; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill -; VECTOR-NEXT: ldr %f0, %f8 -; VECTOR-NEXT: brasl %r14, __extendhfdf2@PLT -; VECTOR-NEXT: vl %v1, 160(%r15), 3 # 16-byte Reload -; VECTOR-NEXT: # kill: def $f0d killed $f0d def $v0 -; VECTOR-NEXT: vmrhg %v0, %v0, %v1 -; VECTOR-NEXT: vfadb %v0, %v0, %v0 -; VECTOR-NEXT: vst %v0, 160(%r15), 3 # 16-byte Spill -; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 -; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT -; VECTOR-NEXT: ldr %f8, %f0 -; VECTOR-NEXT: vl %v0, 160(%r15), 3 # 16-byte Reload -; VECTOR-NEXT: vrepg %v0, %v0, 1 -; VECTOR-NEXT: # kill: def $f0d killed $f0d killed $v0 -; VECTOR-NEXT: brasl %r14, __truncdfhf2@PLT -; VECTOR-NEXT: ldr %f2, %f0 -; VECTOR-NEXT: ldr %f0, %f8 -; VECTOR-NEXT: ld %f8, 176(%r15) # 8-byte Reload -; VECTOR-NEXT: lmg %r14, %r15, 296(%r15) -; VECTOR-NEXT: br %r14 -entry: - %E = fpext <2 x half> %Op to <2 x double> - %Add = fadd <2 x double> %E, %E - %Res = fptrunc <2 x double> %Add to <2 x half> - ret <2 x half> %Res -} - -; Load and store an <8 x half> vector. 
-define void @fun3(ptr %Src, ptr %Dst) { -; NOVEC-LABEL: fun3: -; NOVEC: # %bb.0: # %entry -; NOVEC-NEXT: lgh %r0, 0(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f0, %r0 -; NOVEC-NEXT: lgh %r0, 2(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f1, %r0 -; NOVEC-NEXT: lgh %r0, 4(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f2, %r0 -; NOVEC-NEXT: lgh %r0, 6(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f3, %r0 -; NOVEC-NEXT: lgh %r0, 8(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f4, %r0 -; NOVEC-NEXT: lgh %r0, 10(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f5, %r0 -; NOVEC-NEXT: lgh %r0, 12(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f6, %r0 -; NOVEC-NEXT: lgh %r0, 14(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f7, %r0 -; NOVEC-NEXT: lgdr %r0, %f7 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 14(%r3) -; NOVEC-NEXT: lgdr %r0, %f6 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 12(%r3) -; NOVEC-NEXT: lgdr %r0, %f5 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 10(%r3) -; NOVEC-NEXT: lgdr %r0, %f4 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 8(%r3) -; NOVEC-NEXT: lgdr %r0, %f3 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 6(%r3) -; NOVEC-NEXT: lgdr %r0, %f2 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 4(%r3) -; NOVEC-NEXT: lgdr %r0, %f1 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 2(%r3) -; NOVEC-NEXT: lgdr %r0, %f0 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 0(%r3) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun3: -; VECTOR: # %bb.0: # %entry -; VECTOR-NEXT: vlreph %v0, 0(%r2) -; VECTOR-NEXT: vlreph %v1, 2(%r2) -; VECTOR-NEXT: vlreph %v2, 4(%r2) -; VECTOR-NEXT: vlreph %v3, 6(%r2) -; VECTOR-NEXT: vlreph %v4, 8(%r2) -; VECTOR-NEXT: vlreph %v5, 10(%r2) -; VECTOR-NEXT: vlreph %v6, 12(%r2) -; VECTOR-NEXT: vlreph %v7, 14(%r2) -; VECTOR-NEXT: vsteh %v7, 14(%r3), 0 -; VECTOR-NEXT: vsteh %v6, 12(%r3), 0 -; VECTOR-NEXT: vsteh %v5, 10(%r3), 0 -; VECTOR-NEXT: vsteh %v4, 8(%r3), 0 -; VECTOR-NEXT: vsteh %v3, 6(%r3), 0 -; VECTOR-NEXT: vsteh %v2, 4(%r3), 0 -; VECTOR-NEXT: vsteh %v1, 2(%r3), 0 -; VECTOR-NEXT: vsteh %v0, 0(%r3), 0 -; VECTOR-NEXT: br %r14 -entry: - %L = load <8 x half>, ptr %Src - store <8 x half> %L, ptr %Dst - ret void -} - -; Call a function with <8 x half> argument and return values. 
-declare <8 x half> @foo(<8 x half>) -define void @fun4(ptr %Src, ptr %Dst) { -; NOVEC-LABEL: fun4: -; NOVEC: # %bb.0: # %entry -; NOVEC-NEXT: stmg %r13, %r15, 104(%r15) -; NOVEC-NEXT: .cfi_offset %r13, -56 -; NOVEC-NEXT: .cfi_offset %r14, -48 -; NOVEC-NEXT: .cfi_offset %r15, -40 -; NOVEC-NEXT: aghi %r15, -208 -; NOVEC-NEXT: .cfi_def_cfa_offset 368 -; NOVEC-NEXT: lgh %r0, 0(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f0, %r0 -; NOVEC-NEXT: lgh %r0, 2(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f2, %r0 -; NOVEC-NEXT: # kill: def $f0h killed $f0h killed $f0d -; NOVEC-NEXT: # kill: def $f2h killed $f2h killed $f2d -; NOVEC-NEXT: lgh %r0, 4(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f4, %r0 -; NOVEC-NEXT: # kill: def $f4h killed $f4h killed $f4d -; NOVEC-NEXT: lgh %r0, 6(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f6, %r0 -; NOVEC-NEXT: # kill: def $f6h killed $f6h killed $f6d -; NOVEC-NEXT: lgh %r0, 8(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f1, %r0 -; NOVEC-NEXT: lgh %r0, 10(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f3, %r0 -; NOVEC-NEXT: lgh %r0, 12(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f5, %r0 -; NOVEC-NEXT: lgh %r0, 14(%r2) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f7, %r0 -; NOVEC-NEXT: lgdr %r0, %f7 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 190(%r15) -; NOVEC-NEXT: lgdr %r0, %f5 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 182(%r15) -; NOVEC-NEXT: lgdr %r0, %f3 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 174(%r15) -; NOVEC-NEXT: lgdr %r0, %f1 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: la %r2, 192(%r15) -; NOVEC-NEXT: lgr %r13, %r3 -; NOVEC-NEXT: sth %r0, 166(%r15) -; NOVEC-NEXT: brasl %r14, foo@PLT -; NOVEC-NEXT: lgh %r0, 192(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f0, %r0 -; NOVEC-NEXT: lgh %r0, 194(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f1, %r0 -; NOVEC-NEXT: lgh %r0, 196(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f2, %r0 -; NOVEC-NEXT: lgh %r0, 198(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f3, %r0 -; NOVEC-NEXT: lgh %r0, 200(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f4, %r0 -; NOVEC-NEXT: lgh %r0, 202(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f5, %r0 -; NOVEC-NEXT: lgh %r0, 204(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f6, %r0 -; NOVEC-NEXT: lgh %r0, 206(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f7, %r0 -; NOVEC-NEXT: lgdr %r0, %f7 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 14(%r13) -; NOVEC-NEXT: lgdr %r0, %f6 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 12(%r13) -; NOVEC-NEXT: lgdr %r0, %f5 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 10(%r13) -; NOVEC-NEXT: lgdr %r0, %f4 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 8(%r13) -; NOVEC-NEXT: lgdr %r0, %f3 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 6(%r13) -; NOVEC-NEXT: lgdr %r0, %f2 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 4(%r13) -; NOVEC-NEXT: lgdr %r0, %f1 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 2(%r13) -; NOVEC-NEXT: lgdr %r0, %f0 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 0(%r13) -; NOVEC-NEXT: lmg %r13, %r15, 312(%r15) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun4: -; VECTOR: # %bb.0: # %entry -; VECTOR-NEXT: stmg %r13, %r15, 104(%r15) -; VECTOR-NEXT: .cfi_offset %r13, -56 -; VECTOR-NEXT: .cfi_offset 
%r14, -48 -; VECTOR-NEXT: .cfi_offset %r15, -40 -; VECTOR-NEXT: aghi %r15, -208 -; VECTOR-NEXT: .cfi_def_cfa_offset 368 -; VECTOR-NEXT: vlreph %v6, 6(%r2) -; VECTOR-NEXT: vlreph %v4, 4(%r2) -; VECTOR-NEXT: vlreph %v2, 2(%r2) -; VECTOR-NEXT: vlreph %v0, 0(%r2) -; VECTOR-NEXT: vlreph %v1, 8(%r2) -; VECTOR-NEXT: vlreph %v3, 10(%r2) -; VECTOR-NEXT: vlreph %v5, 12(%r2) -; VECTOR-NEXT: vlreph %v7, 14(%r2) -; VECTOR-NEXT: la %r2, 192(%r15) -; VECTOR-NEXT: lgr %r13, %r3 -; VECTOR-NEXT: vsteh %v7, 190(%r15), 0 -; VECTOR-NEXT: vsteh %v5, 182(%r15), 0 -; VECTOR-NEXT: vsteh %v3, 174(%r15), 0 -; VECTOR-NEXT: vsteh %v1, 166(%r15), 0 -; VECTOR-NEXT: brasl %r14, foo@PLT -; VECTOR-NEXT: vlreph %v0, 192(%r15) -; VECTOR-NEXT: vlreph %v1, 194(%r15) -; VECTOR-NEXT: vlreph %v2, 196(%r15) -; VECTOR-NEXT: vlreph %v3, 198(%r15) -; VECTOR-NEXT: vlreph %v4, 200(%r15) -; VECTOR-NEXT: vlreph %v5, 202(%r15) -; VECTOR-NEXT: vlreph %v6, 204(%r15) -; VECTOR-NEXT: vlreph %v7, 206(%r15) -; VECTOR-NEXT: vsteh %v7, 14(%r13), 0 -; VECTOR-NEXT: vsteh %v6, 12(%r13), 0 -; VECTOR-NEXT: vsteh %v5, 10(%r13), 0 -; VECTOR-NEXT: vsteh %v4, 8(%r13), 0 -; VECTOR-NEXT: vsteh %v3, 6(%r13), 0 -; VECTOR-NEXT: vsteh %v2, 4(%r13), 0 -; VECTOR-NEXT: vsteh %v1, 2(%r13), 0 -; VECTOR-NEXT: vsteh %v0, 0(%r13), 0 -; VECTOR-NEXT: lmg %r13, %r15, 312(%r15) -; VECTOR-NEXT: br %r14 -entry: - %arg = load <8 x half>, ptr %Src - %Res = call <8 x half> @foo(<8 x half> %arg) - store <8 x half> %Res, ptr %Dst - ret void -} - -; Receive and pass argument fully on stack. -declare void @foo2(<4 x half> %dummy, <8 x half> %Arg5) -define void @fun5(<4 x half> %dummy, <8 x half> %Arg5) { -; NOVEC-LABEL: fun5: -; NOVEC: # %bb.0: -; NOVEC-NEXT: stmg %r14, %r15, 112(%r15) -; NOVEC-NEXT: .cfi_offset %r14, -48 -; NOVEC-NEXT: .cfi_offset %r15, -40 -; NOVEC-NEXT: aghi %r15, -256 -; NOVEC-NEXT: .cfi_def_cfa_offset 416 -; NOVEC-NEXT: std %f8, 248(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f9, 240(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f10, 232(%r15) # 8-byte Spill -; NOVEC-NEXT: std %f11, 224(%r15) # 8-byte Spill -; NOVEC-NEXT: .cfi_offset %f8, -168 -; NOVEC-NEXT: .cfi_offset %f9, -176 -; NOVEC-NEXT: .cfi_offset %f10, -184 -; NOVEC-NEXT: .cfi_offset %f11, -192 -; NOVEC-NEXT: lgh %r0, 422(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f1, %r0 -; NOVEC-NEXT: lgh %r0, 430(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f3, %r0 -; NOVEC-NEXT: lgh %r0, 438(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f5, %r0 -; NOVEC-NEXT: lgh %r0, 446(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f7, %r0 -; NOVEC-NEXT: lgh %r0, 454(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f8, %r0 -; NOVEC-NEXT: lgh %r0, 462(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f9, %r0 -; NOVEC-NEXT: lgh %r0, 470(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f10, %r0 -; NOVEC-NEXT: lgh %r0, 478(%r15) -; NOVEC-NEXT: sllg %r0, %r0, 48 -; NOVEC-NEXT: ldgr %f11, %r0 -; NOVEC-NEXT: lgdr %r0, %f11 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 222(%r15) -; NOVEC-NEXT: lgdr %r0, %f10 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 214(%r15) -; NOVEC-NEXT: lgdr %r0, %f9 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 206(%r15) -; NOVEC-NEXT: lgdr %r0, %f8 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 198(%r15) -; NOVEC-NEXT: lgdr %r0, %f7 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 190(%r15) -; NOVEC-NEXT: lgdr %r0, %f5 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 
182(%r15) -; NOVEC-NEXT: lgdr %r0, %f3 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 174(%r15) -; NOVEC-NEXT: lgdr %r0, %f1 -; NOVEC-NEXT: srlg %r0, %r0, 48 -; NOVEC-NEXT: sth %r0, 166(%r15) -; NOVEC-NEXT: brasl %r14, foo2@PLT -; NOVEC-NEXT: ld %f8, 248(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f9, 240(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f10, 232(%r15) # 8-byte Reload -; NOVEC-NEXT: ld %f11, 224(%r15) # 8-byte Reload -; NOVEC-NEXT: lmg %r14, %r15, 368(%r15) -; NOVEC-NEXT: br %r14 -; -; VECTOR-LABEL: fun5: -; VECTOR: # %bb.0: -; VECTOR-NEXT: stmg %r14, %r15, 112(%r15) -; VECTOR-NEXT: .cfi_offset %r14, -48 -; VECTOR-NEXT: .cfi_offset %r15, -40 -; VECTOR-NEXT: aghi %r15, -224 -; VECTOR-NEXT: .cfi_def_cfa_offset 384 -; VECTOR-NEXT: vlreph %v1, 390(%r15) -; VECTOR-NEXT: vlreph %v3, 398(%r15) -; VECTOR-NEXT: vlreph %v5, 406(%r15) -; VECTOR-NEXT: vlreph %v7, 414(%r15) -; VECTOR-NEXT: vlreph %v16, 422(%r15) -; VECTOR-NEXT: vlreph %v17, 430(%r15) -; VECTOR-NEXT: vlreph %v18, 438(%r15) -; VECTOR-NEXT: vlreph %v19, 446(%r15) -; VECTOR-NEXT: vsteh %v19, 222(%r15), 0 -; VECTOR-NEXT: vsteh %v18, 214(%r15), 0 -; VECTOR-NEXT: vsteh %v17, 206(%r15), 0 -; VECTOR-NEXT: vsteh %v16, 198(%r15), 0 -; VECTOR-NEXT: vsteh %v7, 190(%r15), 0 -; VECTOR-NEXT: vsteh %v5, 182(%r15), 0 -; VECTOR-NEXT: vsteh %v3, 174(%r15), 0 -; VECTOR-NEXT: vsteh %v1, 166(%r15), 0 -; VECTOR-NEXT: brasl %r14, foo2@PLT -; VECTOR-NEXT: lmg %r14, %r15, 336(%r15) -; VECTOR-NEXT: br %r14 - call void @foo2(<4 x half> %dummy, <8 x half> %Arg5) - ret void -} diff --git a/llvm/test/CodeGen/SystemZ/vec-abi-01.ll b/llvm/test/CodeGen/SystemZ/vec-abi-01.ll new file mode 100644 index 0000000000000..6ce8288aa8499 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-abi-01.ll @@ -0,0 +1,2489 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; +; Test passing vector arguments per the ABI for z10 (without vector support). 
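+; Without the vector facility, every vector is passed indirectly: the caller
+; materializes the value in memory and passes its address (marked
+; dead_on_return), and vector return values are returned through an sret
+; pointer, as the functions below show.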
+; The function names codify the element type and the size of the vector in +; bytes, just like in the clang test systemz-abi-vector.c + +@global_char_1 = global <1 x i8> zeroinitializer, align 2 +@global_char_8 = global <8 x i8> zeroinitializer, align 8 +@global_char_16 = global <16 x i8> zeroinitializer, align 16 +@global_char_32 = global <32 x i8> zeroinitializer, align 32 +@global_short_2 = global <1 x i16> zeroinitializer, align 2 +@global_short_8 = global <4 x i16> zeroinitializer, align 8 +@global_short_16 = global <8 x i16> zeroinitializer, align 16 +@global_int_4 = global <1 x i32> zeroinitializer, align 4 +@global_int_8 = global <2 x i32> zeroinitializer, align 8 +@global_int_16 = global <4 x i32> zeroinitializer, align 16 +@global_int_32 = global <8 x i32> zeroinitializer, align 32 +@global_long_8 = global <1 x i64> zeroinitializer, align 8 +@global_long_16 = global <2 x i64> zeroinitializer, align 16 +@global___int128_16 = global <1 x i128> zeroinitializer, align 16 +@global___int128_32 = global <2 x i128> zeroinitializer, align 32 +@global__Float16_2 = global <1 x half> zeroinitializer, align 2 +@global__Float16_8 = global <4 x half> zeroinitializer, align 8 +@global__Float16_16 = global <8 x half> zeroinitializer, align 16 +@global__Float16_32 = global <16 x half> zeroinitializer, align 32 +@global_float_4 = global <1 x float> zeroinitializer, align 4 +@global_float_8 = global <2 x float> zeroinitializer, align 8 +@global_float_16 = global <4 x float> zeroinitializer, align 16 +@global_double_8 = global <1 x double> zeroinitializer, align 8 +@global_double_16 = global <2 x double> zeroinitializer, align 16 +@global_double_32 = global <4 x double> zeroinitializer, align 32 +@global_long_double_16 = global <1 x fp128> zeroinitializer, align 16 +@global_long_double_32 = global <2 x fp128> zeroinitializer, align 32 + +define void @takeAndStore_char_1(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_char_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lb %r0, 0(%r2) +; CHECK-NEXT: lgrl %r1, global_char_1@GOT +; CHECK-NEXT: stc %r0, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <1 x i8>, ptr %0, align 1 + store <1 x i8> %x, ptr @global_char_1, align 2 + ret void +} + +define void @takeAndStore_char_8(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_char_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: lh %r0, 0(%r2) +; CHECK-NEXT: lb %r1, 2(%r2) +; CHECK-NEXT: lb %r3, 3(%r2) +; CHECK-NEXT: lb %r4, 7(%r2) +; CHECK-NEXT: lgrl %r5, global_char_8@GOT +; CHECK-NEXT: lb %r14, 6(%r2) +; CHECK-NEXT: lb %r13, 5(%r2) +; CHECK-NEXT: lb %r2, 4(%r2) +; CHECK-NEXT: stc %r4, 7(%r5) +; CHECK-NEXT: stc %r14, 6(%r5) +; CHECK-NEXT: stc %r13, 5(%r5) +; CHECK-NEXT: stc %r2, 4(%r5) +; CHECK-NEXT: stc %r3, 3(%r5) +; CHECK-NEXT: stc %r1, 2(%r5) +; CHECK-NEXT: sth %r0, 0(%r5) +; CHECK-NEXT: lmg %r13, %r15, 104(%r15) +; CHECK-NEXT: br %r14 +entry: + %x = load <8 x i8>, ptr %0, align 8 + store <8 x i8> %x, ptr @global_char_8, align 8 + ret void +} + +define void @takeAndStore_char_16(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_char_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lg %r0, 8(%r2) +; CHECK-NEXT: lgrl %r1, global_char_16@GOT +; CHECK-NEXT: lg %r2, 0(%r2) +; CHECK-NEXT: stg %r0, 8(%r1) +; CHECK-NEXT: stg %r2, 0(%r1) +; CHECK-NEXT: br 
%r14 +entry: + %x = load <16 x i8>, ptr %0, align 16 + store <16 x i8> %x, ptr @global_char_16, align 16 + ret void +} + +define void @takeAndStore_char_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_char_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lg %r0, 24(%r2) +; CHECK-NEXT: lgrl %r1, global_char_32@GOT +; CHECK-NEXT: lg %r3, 16(%r2) +; CHECK-NEXT: lg %r4, 8(%r2) +; CHECK-NEXT: lg %r2, 0(%r2) +; CHECK-NEXT: stg %r0, 24(%r1) +; CHECK-NEXT: stg %r3, 16(%r1) +; CHECK-NEXT: stg %r4, 8(%r1) +; CHECK-NEXT: stg %r2, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <32 x i8>, ptr %0, align 32 + store <32 x i8> %x, ptr @global_char_32, align 32 + ret void +} + +define void @takeAndStore_short_2(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_short_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lh %r0, 0(%r2) +; CHECK-NEXT: lgrl %r1, global_short_2@GOT +; CHECK-NEXT: sth %r0, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <1 x i16>, ptr %0, align 2 + store <1 x i16> %x, ptr @global_short_2, align 2 + ret void +} + +define void @takeAndStore_short_8(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_short_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lh %r0, 6(%r2) +; CHECK-NEXT: lgrl %r1, global_short_8@GOT +; CHECK-NEXT: lh %r3, 4(%r2) +; CHECK-NEXT: l %r2, 0(%r2) +; CHECK-NEXT: sth %r0, 6(%r1) +; CHECK-NEXT: sth %r3, 4(%r1) +; CHECK-NEXT: st %r2, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <4 x i16>, ptr %0, align 8 + store <4 x i16> %x, ptr @global_short_8, align 8 + ret void +} + +define void @takeAndStore_short_16(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_short_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: l %r0, 0(%r2) +; CHECK-NEXT: lh %r1, 4(%r2) +; CHECK-NEXT: lh %r3, 6(%r2) +; CHECK-NEXT: lh %r4, 14(%r2) +; CHECK-NEXT: lgrl %r5, global_short_16@GOT +; CHECK-NEXT: lh %r14, 12(%r2) +; CHECK-NEXT: lh %r13, 10(%r2) +; CHECK-NEXT: lh %r2, 8(%r2) +; CHECK-NEXT: sth %r4, 14(%r5) +; CHECK-NEXT: sth %r14, 12(%r5) +; CHECK-NEXT: sth %r13, 10(%r5) +; CHECK-NEXT: sth %r2, 8(%r5) +; CHECK-NEXT: sth %r3, 6(%r5) +; CHECK-NEXT: sth %r1, 4(%r5) +; CHECK-NEXT: st %r0, 0(%r5) +; CHECK-NEXT: lmg %r13, %r15, 104(%r15) +; CHECK-NEXT: br %r14 +entry: + %x = load <8 x i16>, ptr %0, align 16 + store <8 x i16> %x, ptr @global_short_16, align 16 + ret void +} + +define void @takeAndStore_int_4(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_int_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: l %r0, 0(%r2) +; CHECK-NEXT: lgrl %r1, global_int_4@GOT +; CHECK-NEXT: st %r0, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <1 x i32>, ptr %0, align 4 + store <1 x i32> %x, ptr @global_int_4, align 4 + ret void +} + +define void @takeAndStore_int_8(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_int_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lg %r0, 0(%r2) +; CHECK-NEXT: lgrl %r1, global_int_8@GOT +; CHECK-NEXT: stg %r0, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <2 x i32>, ptr %0, align 8 + store <2 x i32> %x, ptr @global_int_8, align 8 + ret void +} + +define void @takeAndStore_int_16(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_int_16: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: l %r0, 12(%r2) +; CHECK-NEXT: lgrl %r1, global_int_16@GOT +; CHECK-NEXT: l %r3, 8(%r2) +; CHECK-NEXT: lg %r2, 0(%r2) +; CHECK-NEXT: st %r0, 12(%r1) +; CHECK-NEXT: st %r3, 8(%r1) +; CHECK-NEXT: stg %r2, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <4 x i32>, ptr %0, align 16 + store <4 x i32> %x, ptr @global_int_16, align 16 + ret void +} + +define void @takeAndStore_int_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_int_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: l %r0, 8(%r2) +; CHECK-NEXT: l %r1, 12(%r2) +; CHECK-NEXT: l %r3, 16(%r2) +; CHECK-NEXT: l %r4, 28(%r2) +; CHECK-NEXT: lgrl %r5, global_int_32@GOT +; CHECK-NEXT: l %r14, 24(%r2) +; CHECK-NEXT: l %r13, 20(%r2) +; CHECK-NEXT: lg %r2, 0(%r2) +; CHECK-NEXT: st %r4, 28(%r5) +; CHECK-NEXT: st %r14, 24(%r5) +; CHECK-NEXT: st %r13, 20(%r5) +; CHECK-NEXT: st %r3, 16(%r5) +; CHECK-NEXT: st %r1, 12(%r5) +; CHECK-NEXT: st %r0, 8(%r5) +; CHECK-NEXT: stg %r2, 0(%r5) +; CHECK-NEXT: lmg %r13, %r15, 104(%r15) +; CHECK-NEXT: br %r14 +entry: + %x = load <8 x i32>, ptr %0, align 32 + store <8 x i32> %x, ptr @global_int_32, align 32 + ret void +} + +define void @takeAndStore_long_8(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_long_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lg %r0, 0(%r2) +; CHECK-NEXT: lgrl %r1, global_long_8@GOT +; CHECK-NEXT: stg %r0, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <1 x i64>, ptr %0, align 8 + store <1 x i64> %x, ptr @global_long_8, align 8 + ret void +} + +define void @takeAndStore_long_16(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_long_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lg %r0, 8(%r2) +; CHECK-NEXT: lgrl %r1, global_long_16@GOT +; CHECK-NEXT: lg %r2, 0(%r2) +; CHECK-NEXT: stg %r0, 8(%r1) +; CHECK-NEXT: stg %r2, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <2 x i64>, ptr %0, align 16 + store <2 x i64> %x, ptr @global_long_16, align 16 + ret void +} + +define void @takeAndStore___int128_16(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore___int128_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lg %r0, 0(%r2) +; CHECK-NEXT: lgrl %r1, global___int128_16@GOT +; CHECK-NEXT: lg %r2, 8(%r2) +; CHECK-NEXT: stg %r0, 0(%r1) +; CHECK-NEXT: stg %r2, 8(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <1 x i128>, ptr %0, align 16 + store <1 x i128> %x, ptr @global___int128_16, align 16 + ret void +} + +define void @takeAndStore___int128_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore___int128_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lg %r0, 16(%r2) +; CHECK-NEXT: lgrl %r1, global___int128_32@GOT +; CHECK-NEXT: lg %r3, 24(%r2) +; CHECK-NEXT: lg %r4, 0(%r2) +; CHECK-NEXT: lg %r2, 8(%r2) +; CHECK-NEXT: stg %r0, 16(%r1) +; CHECK-NEXT: stg %r3, 24(%r1) +; CHECK-NEXT: stg %r4, 0(%r1) +; CHECK-NEXT: stg %r2, 8(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <2 x i128>, ptr %0, align 32 + store <2 x i128> %x, ptr @global___int128_32, align 32 + ret void +} + +define void @takeAndStore__Float16_2(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore__Float16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgh %r0, 0(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgrl %r1, global__Float16_2@GOT +; 
CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <1 x half>, ptr %0, align 2 + store <1 x half> %x, ptr @global__Float16_2, align 2 + ret void +} + +define void @takeAndStore__Float16_8(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore__Float16_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgh %r0, 4(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgh %r1, 6(%r2) +; CHECK-NEXT: l %r2, 0(%r2) +; CHECK-NEXT: lgrl %r3, global__Float16_8@GOT +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f1, %r0 +; CHECK-NEXT: st %r2, 0(%r3) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r3) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r3) +; CHECK-NEXT: br %r14 +entry: + %x = load <4 x half>, ptr %0, align 8 + store <4 x half> %x, ptr @global__Float16_8, align 8 + ret void +} + +define void @takeAndStore__Float16_16(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore__Float16_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgh %r0, 4(%r2) +; CHECK-NEXT: lgh %r1, 6(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgh %r0, 8(%r2) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f1, %r1 +; CHECK-NEXT: lgh %r1, 10(%r2) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgh %r3, 12(%r2) +; CHECK-NEXT: ldgr %f2, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f3, %r0 +; CHECK-NEXT: sllg %r0, %r3, 48 +; CHECK-NEXT: lgh %r3, 14(%r2) +; CHECK-NEXT: l %r2, 0(%r2) +; CHECK-NEXT: lgrl %r1, global__Float16_16@GOT +; CHECK-NEXT: ldgr %f4, %r0 +; CHECK-NEXT: sllg %r0, %r3, 48 +; CHECK-NEXT: ldgr %f5, %r0 +; CHECK-NEXT: st %r2, 0(%r1) +; CHECK-NEXT: lgdr %r0, %f5 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r1) +; CHECK-NEXT: lgdr %r0, %f4 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r1) +; CHECK-NEXT: lgdr %r0, %f3 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r1) +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 8(%r1) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r1) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <8 x half>, ptr %0, align 16 + store <8 x half> %x, ptr @global__Float16_16, align 16 + ret void +} + +define void @takeAndStore__Float16_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore__Float16_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lg %r0, 24(%r2) +; CHECK-NEXT: lgrl %r1, global__Float16_32@GOT +; CHECK-NEXT: lg %r3, 16(%r2) +; CHECK-NEXT: lg %r4, 8(%r2) +; CHECK-NEXT: lg %r2, 0(%r2) +; CHECK-NEXT: stg %r0, 24(%r1) +; CHECK-NEXT: stg %r3, 16(%r1) +; CHECK-NEXT: stg %r4, 8(%r1) +; CHECK-NEXT: stg %r2, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <16 x half>, ptr %0, align 32 + store <16 x half> %x, ptr @global__Float16_32, align 32 + ret void +} + +define void @takeAndStore_float_4(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_float_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: le %f0, 0(%r2) +; CHECK-NEXT: lgrl %r1, global_float_4@GOT +; CHECK-NEXT: ste %f0, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <1 x float>, ptr %0, align 4 + store <1 x float> %x, 
ptr @global_float_4, align 4 + ret void +} + +define void @takeAndStore_float_8(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_float_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lg %r0, 0(%r2) +; CHECK-NEXT: lgrl %r1, global_float_8@GOT +; CHECK-NEXT: stg %r0, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <2 x float>, ptr %0, align 8 + store <2 x float> %x, ptr @global_float_8, align 8 + ret void +} + +define void @takeAndStore_float_16(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_float_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: le %f0, 12(%r2) +; CHECK-NEXT: lgrl %r1, global_float_16@GOT +; CHECK-NEXT: le %f1, 8(%r2) +; CHECK-NEXT: lg %r0, 0(%r2) +; CHECK-NEXT: ste %f0, 12(%r1) +; CHECK-NEXT: ste %f1, 8(%r1) +; CHECK-NEXT: stg %r0, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <4 x float>, ptr %0, align 16 + store <4 x float> %x, ptr @global_float_16, align 16 + ret void +} + +define void @takeAndStore_double_8(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_double_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 0(%r2) +; CHECK-NEXT: lgrl %r1, global_double_8@GOT +; CHECK-NEXT: std %f0, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <1 x double>, ptr %0, align 8 + store <1 x double> %x, ptr @global_double_8, align 8 + ret void +} + +define void @takeAndStore_double_16(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 8(%r2) +; CHECK-NEXT: lgrl %r1, global_double_16@GOT +; CHECK-NEXT: ld %f1, 0(%r2) +; CHECK-NEXT: std %f0, 8(%r1) +; CHECK-NEXT: std %f1, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <2 x double>, ptr %0, align 16 + store <2 x double> %x, ptr @global_double_16, align 16 + ret void +} + +define void @takeAndStore_double_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 24(%r2) +; CHECK-NEXT: lgrl %r1, global_double_32@GOT +; CHECK-NEXT: ld %f1, 16(%r2) +; CHECK-NEXT: ld %f2, 8(%r2) +; CHECK-NEXT: ld %f3, 0(%r2) +; CHECK-NEXT: std %f0, 24(%r1) +; CHECK-NEXT: std %f1, 16(%r1) +; CHECK-NEXT: std %f2, 8(%r1) +; CHECK-NEXT: std %f3, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <4 x double>, ptr %0, align 32 + store <4 x double> %x, ptr @global_double_32, align 32 + ret void +} + +define void @takeAndStore_long_double_16(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_long_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 0(%r2) +; CHECK-NEXT: ld %f2, 8(%r2) +; CHECK-NEXT: lgrl %r1, global_long_double_16@GOT +; CHECK-NEXT: std %f0, 0(%r1) +; CHECK-NEXT: std %f2, 8(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <1 x fp128>, ptr %0, align 16 + store <1 x fp128> %x, ptr @global_long_double_16, align 16 + ret void +} + +define void @takeAndStore_long_double_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_long_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: ld %f0, 16(%r2) +; CHECK-NEXT: ld %f2, 24(%r2) +; CHECK-NEXT: lgrl %r1, global_long_double_32@GOT +; CHECK-NEXT: ld %f1, 0(%r2) +; CHECK-NEXT: ld %f3, 8(%r2) +; CHECK-NEXT: std %f0, 16(%r1) +; CHECK-NEXT: std %f2, 24(%r1) +; CHECK-NEXT: std %f1, 0(%r1) +; CHECK-NEXT: std %f3, 8(%r1) +; CHECK-NEXT: br %r14 +entry: + %x = load <2 x fp128>, ptr %0, align 32 + store <2 x fp128> %x, ptr 
@global_long_double_32, align 32 + ret void +} + +define void @loadAndReturn_char_1(ptr dead_on_unwind noalias writable writeonly sret(<1 x i8>) align 1 captures(none) initializes((0, 1)) %agg.result) { +; CHECK-LABEL: loadAndReturn_char_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_1@GOT +; CHECK-NEXT: mvc 0(1,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x i8>, ptr @global_char_1, align 2 + store <1 x i8> %0, ptr %agg.result, align 1 + ret void +} + +define void @loadAndReturn_char_8(ptr dead_on_unwind noalias writable writeonly sret(<8 x i8>) align 8 captures(none) initializes((0, 8)) %agg.result) { +; CHECK-LABEL: loadAndReturn_char_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_8@GOT +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <8 x i8>, ptr @global_char_8, align 8 + store <8 x i8> %0, ptr %agg.result, align 8 + ret void +} + +define void @loadAndReturn_char_16(ptr dead_on_unwind noalias writable writeonly sret(<16 x i8>) align 16 captures(none) initializes((0, 16)) %agg.result) { +; CHECK-LABEL: loadAndReturn_char_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <16 x i8>, ptr @global_char_16, align 16 + store <16 x i8> %0, ptr %agg.result, align 16 + ret void +} + +define void @loadAndReturn_char_32(ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 32 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn_char_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_32@GOT +; CHECK-NEXT: mvc 24(8,%r2), 24(%r1) +; CHECK-NEXT: mvc 16(8,%r2), 16(%r1) +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <32 x i8>, ptr @global_char_32, align 32 + store <32 x i8> %0, ptr %agg.result, align 32 + ret void +} + +define void @loadAndReturn_short_2(ptr dead_on_unwind noalias writable writeonly sret(<1 x i16>) align 2 captures(none) initializes((0, 2)) %agg.result) { +; CHECK-LABEL: loadAndReturn_short_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_2@GOT +; CHECK-NEXT: mvc 0(2,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x i16>, ptr @global_short_2, align 2 + store <1 x i16> %0, ptr %agg.result, align 2 + ret void +} + +define void @loadAndReturn_short_8(ptr dead_on_unwind noalias writable writeonly sret(<4 x i16>) align 8 captures(none) initializes((0, 8)) %agg.result) { +; CHECK-LABEL: loadAndReturn_short_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_8@GOT +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x i16>, ptr @global_short_8, align 8 + store <4 x i16> %0, ptr %agg.result, align 8 + ret void +} + +define void @loadAndReturn_short_16(ptr dead_on_unwind noalias writable writeonly sret(<8 x i16>) align 16 captures(none) initializes((0, 16)) %agg.result) { +; CHECK-LABEL: loadAndReturn_short_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <8 x i16>, ptr @global_short_16, align 16 + store <8 x i16> %0, ptr %agg.result, align 16 + ret void +} + +define void @loadAndReturn_int_4(ptr dead_on_unwind noalias writable writeonly sret(<1 x i32>) align 4 captures(none) initializes((0, 4)) %agg.result) { +; CHECK-LABEL: 
loadAndReturn_int_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_4@GOT +; CHECK-NEXT: mvc 0(4,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x i32>, ptr @global_int_4, align 4 + store <1 x i32> %0, ptr %agg.result, align 4 + ret void +} + +define void @loadAndReturn_int_8(ptr dead_on_unwind noalias writable writeonly sret(<2 x i32>) align 8 captures(none) initializes((0, 8)) %agg.result) { +; CHECK-LABEL: loadAndReturn_int_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_8@GOT +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x i32>, ptr @global_int_8, align 8 + store <2 x i32> %0, ptr %agg.result, align 8 + ret void +} + +define void @loadAndReturn_int_16(ptr dead_on_unwind noalias writable writeonly sret(<4 x i32>) align 16 captures(none) initializes((0, 16)) %agg.result) { +; CHECK-LABEL: loadAndReturn_int_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x i32>, ptr @global_int_16, align 16 + store <4 x i32> %0, ptr %agg.result, align 16 + ret void +} + +define void @loadAndReturn_int_32(ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 32 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn_int_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_32@GOT +; CHECK-NEXT: mvc 24(8,%r2), 24(%r1) +; CHECK-NEXT: mvc 16(8,%r2), 16(%r1) +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <8 x i32>, ptr @global_int_32, align 32 + store <8 x i32> %0, ptr %agg.result, align 32 + ret void +} + +define void @loadAndReturn_long_8(ptr dead_on_unwind noalias writable writeonly sret(<1 x i64>) align 8 captures(none) initializes((0, 8)) %agg.result) { +; CHECK-LABEL: loadAndReturn_long_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_8@GOT +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x i64>, ptr @global_long_8, align 8 + store <1 x i64> %0, ptr %agg.result, align 8 + ret void +} + +define void @loadAndReturn_long_16(ptr dead_on_unwind noalias writable writeonly sret(<2 x i64>) align 16 captures(none) initializes((0, 16)) %agg.result) { +; CHECK-LABEL: loadAndReturn_long_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x i64>, ptr @global_long_16, align 16 + store <2 x i64> %0, ptr %agg.result, align 16 + ret void +} + +define void @loadAndReturn___int128_16(ptr dead_on_unwind noalias writable writeonly sret(<1 x i128>) align 16 captures(none) initializes((0, 16)) %agg.result) { +; CHECK-LABEL: loadAndReturn___int128_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global___int128_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x i128>, ptr @global___int128_16, align 16 + store <1 x i128> %0, ptr %agg.result, align 16 + ret void +} + +define void @loadAndReturn___int128_32(ptr dead_on_unwind noalias writable writeonly sret(<2 x i128>) align 32 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn___int128_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global___int128_32@GOT +; CHECK-NEXT: mvc 24(8,%r2), 24(%r1) +; CHECK-NEXT: mvc 16(8,%r2), 
16(%r1) +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x i128>, ptr @global___int128_32, align 32 + store <2 x i128> %0, ptr %agg.result, align 32 + ret void +} + +define void @loadAndReturn__Float16_2(ptr dead_on_unwind noalias writable writeonly sret(<1 x half>) align 2 captures(none) initializes((0, 2)) %agg.result) { +; CHECK-LABEL: loadAndReturn__Float16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_2@GOT +; CHECK-NEXT: lgh %r0, 0(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r2) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x half>, ptr @global__Float16_2, align 2 + store <1 x half> %0, ptr %agg.result, align 2 + ret void +} + +define void @loadAndReturn__Float16_8(ptr dead_on_unwind noalias writable writeonly sret(<4 x half>) align 8 captures(none) initializes((0, 8)) %agg.result) { +; CHECK-LABEL: loadAndReturn__Float16_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_8@GOT +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x half>, ptr @global__Float16_8, align 8 + store <4 x half> %0, ptr %agg.result, align 8 + ret void +} + +define void @loadAndReturn__Float16_16(ptr dead_on_unwind noalias writable writeonly sret(<8 x half>) align 16 captures(none) initializes((0, 16)) %agg.result) { +; CHECK-LABEL: loadAndReturn__Float16_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <8 x half>, ptr @global__Float16_16, align 16 + store <8 x half> %0, ptr %agg.result, align 16 + ret void +} + +define void @loadAndReturn__Float16_32(ptr dead_on_unwind noalias writable writeonly sret(<16 x half>) align 32 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn__Float16_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_32@GOT +; CHECK-NEXT: mvc 24(8,%r2), 24(%r1) +; CHECK-NEXT: mvc 16(8,%r2), 16(%r1) +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <16 x half>, ptr @global__Float16_32, align 32 + store <16 x half> %0, ptr %agg.result, align 32 + ret void +} + +define void @loadAndReturn_float_4(ptr dead_on_unwind noalias writable writeonly sret(<1 x float>) align 4 captures(none) initializes((0, 4)) %agg.result) { +; CHECK-LABEL: loadAndReturn_float_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_4@GOT +; CHECK-NEXT: mvc 0(4,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x float>, ptr @global_float_4, align 4 + store <1 x float> %0, ptr %agg.result, align 4 + ret void +} + +define void @loadAndReturn_float_8(ptr dead_on_unwind noalias writable writeonly sret(<2 x float>) align 8 captures(none) initializes((0, 8)) %agg.result) { +; CHECK-LABEL: loadAndReturn_float_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_8@GOT +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x float>, ptr @global_float_8, align 8 + store <2 x float> %0, ptr %agg.result, align 8 + ret void +} + +define void @loadAndReturn_float_16(ptr dead_on_unwind noalias writable writeonly sret(<4 x float>) align 16 captures(none) initializes((0, 16)) %agg.result) { +; CHECK-LABEL: loadAndReturn_float_16: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: lgrl %r1, global_float_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x float>, ptr @global_float_16, align 16 + store <4 x float> %0, ptr %agg.result, align 16 + ret void +} + +define void @loadAndReturn_double_8(ptr dead_on_unwind noalias writable writeonly sret(<1 x double>) align 8 captures(none) initializes((0, 8)) %agg.result) { +; CHECK-LABEL: loadAndReturn_double_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_double_8@GOT +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x double>, ptr @global_double_8, align 8 + store <1 x double> %0, ptr %agg.result, align 8 + ret void +} + +define void @loadAndReturn_double_16(ptr dead_on_unwind noalias writable writeonly sret(<2 x double>) align 16 captures(none) initializes((0, 16)) %agg.result) { +; CHECK-LABEL: loadAndReturn_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_double_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x double>, ptr @global_double_16, align 16 + store <2 x double> %0, ptr %agg.result, align 16 + ret void +} + +define void @loadAndReturn_double_32(ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 32 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_double_32@GOT +; CHECK-NEXT: mvc 24(8,%r2), 24(%r1) +; CHECK-NEXT: mvc 16(8,%r2), 16(%r1) +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x double>, ptr @global_double_32, align 32 + store <4 x double> %0, ptr %agg.result, align 32 + ret void +} + +define void @loadAndReturn_long_double_16(ptr dead_on_unwind noalias writable writeonly sret(<1 x fp128>) align 16 captures(none) initializes((0, 16)) %agg.result) { +; CHECK-LABEL: loadAndReturn_long_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_double_16@GOT +; CHECK-NEXT: mvc 0(16,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x fp128>, ptr @global_long_double_16, align 16 + store <1 x fp128> %0, ptr %agg.result, align 16 + ret void +} + +define void @loadAndReturn_long_double_32(ptr dead_on_unwind noalias writable writeonly sret(<2 x fp128>) align 32 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn_long_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_double_32@GOT +; CHECK-NEXT: mvc 16(16,%r2), 16(%r1) +; CHECK-NEXT: mvc 0(16,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x fp128>, ptr @global_long_double_32, align 32 + store <2 x fp128> %0, ptr %agg.result, align 32 + ret void +} + +define void @loadAndPass_char_1() { +; CHECK-LABEL: loadAndPass_char_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global_char_1@GOT +; CHECK-NEXT: mvc 167(1,%r15), 0(%r1) +; CHECK-NEXT: la %r2, 167(%r15) +; CHECK-NEXT: brasl %r14, passCallee_char_1@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <1 x i8>, align 1 + %0 = load <1 x i8>, ptr @global_char_1, align 2 + store <1 x i8> %0, ptr %byval-temp, align 1 + call void @passCallee_char_1(ptr noundef nonnull 
dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_char_1(ptr noundef dead_on_return) + +define void @loadAndPass_char_8() { +; CHECK-LABEL: loadAndPass_char_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global_char_8@GOT +; CHECK-NEXT: mvc 160(8,%r15), 0(%r1) +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, passCallee_char_8@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <8 x i8>, align 8 + %0 = load <8 x i8>, ptr @global_char_8, align 8 + store <8 x i8> %0, ptr %byval-temp, align 8 + call void @passCallee_char_8(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_char_8(ptr noundef dead_on_return) + +define void @loadAndPass_char_16() { +; CHECK-LABEL: loadAndPass_char_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r2, 168(%r1) +; CHECK-NEXT: nill %r2, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_char_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_char_16@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <16 x i8>, align 16 + %0 = load <16 x i8>, ptr @global_char_16, align 16 + store <16 x i8> %0, ptr %byval-temp, align 16 + call void @passCallee_char_16(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_char_16(ptr noundef dead_on_return) + +define void @loadAndPass_char_32() { +; CHECK-LABEL: loadAndPass_char_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r2, 184(%r1) +; CHECK-NEXT: nill %r2, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_char_32@GOT +; CHECK-NEXT: mvc 24(8,%r2), 24(%r1) +; CHECK-NEXT: mvc 16(8,%r2), 16(%r1) +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_char_32@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <32 x i8>, align 32 + %0 = load <32 x i8>, ptr @global_char_32, align 32 + store <32 x i8> %0, ptr %byval-temp, align 32 + call void @passCallee_char_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_char_32(ptr noundef dead_on_return) + +define void @loadAndPass_short_2() { +; CHECK-LABEL: loadAndPass_short_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global_short_2@GOT +; CHECK-NEXT: mvc 166(2,%r15), 0(%r1)
+; CHECK-NEXT: la %r2, 166(%r15) +; CHECK-NEXT: brasl %r14, passCallee_short_2@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <1 x i16>, align 2 + %0 = load <1 x i16>, ptr @global_short_2, align 2 + store <1 x i16> %0, ptr %byval-temp, align 2 + call void @passCallee_short_2(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_short_2(ptr noundef dead_on_return) + +define void @loadAndPass_short_8() { +; CHECK-LABEL: loadAndPass_short_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global_short_8@GOT +; CHECK-NEXT: mvc 160(8,%r15), 0(%r1) +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, passCallee_short_8@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <4 x i16>, align 8 + %0 = load <4 x i16>, ptr @global_short_8, align 8 + store <4 x i16> %0, ptr %byval-temp, align 8 + call void @passCallee_short_8(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_short_8(ptr noundef dead_on_return) + +define void @loadAndPass_short_16() { +; CHECK-LABEL: loadAndPass_short_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r2, 168(%r1) +; CHECK-NEXT: nill %r2, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_short_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_short_16@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <8 x i16>, align 16 + %0 = load <8 x i16>, ptr @global_short_16, align 16 + store <8 x i16> %0, ptr %byval-temp, align 16 + call void @passCallee_short_16(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_short_16(ptr noundef dead_on_return) + +define void @loadAndPass_int_4() { +; CHECK-LABEL: loadAndPass_int_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global_int_4@GOT +; CHECK-NEXT: mvc 164(4,%r15), 0(%r1) +; CHECK-NEXT: la %r2, 164(%r15) +; CHECK-NEXT: brasl %r14, passCallee_int_4@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <1 x i32>, align 4 + %0 = load <1 x i32>, ptr @global_int_4, align 4 + store <1 x i32> %0, ptr %byval-temp, align 4 + call void @passCallee_int_4(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_int_4(ptr noundef dead_on_return) + +define void @loadAndPass_int_8() { +; CHECK-LABEL: loadAndPass_int_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global_int_8@GOT +; CHECK-NEXT: mvc 160(8,%r15), 
0(%r1) +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, passCallee_int_8@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <2 x i32>, align 8 + %0 = load <2 x i32>, ptr @global_int_8, align 8 + store <2 x i32> %0, ptr %byval-temp, align 8 + call void @passCallee_int_8(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_int_8(ptr noundef dead_on_return) + +define void @loadAndPass_int_16() { +; CHECK-LABEL: loadAndPass_int_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r2, 168(%r1) +; CHECK-NEXT: nill %r2, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_int_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_int_16@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <4 x i32>, align 16 + %0 = load <4 x i32>, ptr @global_int_16, align 16 + store <4 x i32> %0, ptr %byval-temp, align 16 + call void @passCallee_int_16(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_int_16(ptr noundef dead_on_return) + +define void @loadAndPass_int_32() { +; CHECK-LABEL: loadAndPass_int_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r2, 184(%r1) +; CHECK-NEXT: nill %r2, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_int_32@GOT +; CHECK-NEXT: mvc 24(8,%r2), 24(%r1) +; CHECK-NEXT: mvc 16(8,%r2), 16(%r1) +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_int_32@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <8 x i32>, align 32 + %0 = load <8 x i32>, ptr @global_int_32, align 32 + store <8 x i32> %0, ptr %byval-temp, align 32 + call void @passCallee_int_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_int_32(ptr noundef dead_on_return) + +define void @loadAndPass_long_8() { +; CHECK-LABEL: loadAndPass_long_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global_long_8@GOT +; CHECK-NEXT: mvc 160(8,%r15), 0(%r1) +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, passCallee_long_8@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <1 x i64>, align 8 + %0 = load <1 x i64>, ptr @global_long_8, align 8 + store <1 x i64> %0, ptr %byval-temp, align 8 + call void @passCallee_long_8(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_long_8(ptr noundef dead_on_return) + +define void @loadAndPass_long_16() { 
+; CHECK-LABEL: loadAndPass_long_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r2, 168(%r1) +; CHECK-NEXT: nill %r2, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_long_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_long_16@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <2 x i64>, align 16 + %0 = load <2 x i64>, ptr @global_long_16, align 16 + store <2 x i64> %0, ptr %byval-temp, align 16 + call void @passCallee_long_16(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_long_16(ptr noundef dead_on_return) + +define void @loadAndPass___int128_16() { +; CHECK-LABEL: loadAndPass___int128_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r2, 168(%r1) +; CHECK-NEXT: nill %r2, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global___int128_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee___int128_16@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <1 x i128>, align 16 + %0 = load <1 x i128>, ptr @global___int128_16, align 16 + store <1 x i128> %0, ptr %byval-temp, align 16 + call void @passCallee___int128_16(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee___int128_16(ptr noundef dead_on_return) + +define void @loadAndPass___int128_32() { +; CHECK-LABEL: loadAndPass___int128_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r2, 184(%r1) +; CHECK-NEXT: nill %r2, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global___int128_32@GOT +; CHECK-NEXT: mvc 24(8,%r2), 24(%r1) +; CHECK-NEXT: mvc 16(8,%r2), 16(%r1) +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee___int128_32@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <2 x i128>, align 32 + %0 = load <2 x i128>, ptr @global___int128_32, align 32 + store <2 x i128> %0, ptr %byval-temp, align 32 + call void @passCallee___int128_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee___int128_32(ptr noundef dead_on_return) + +define void @loadAndPass__Float16_2() { +; CHECK-LABEL: loadAndPass__Float16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 
+; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global__Float16_2@GOT +; CHECK-NEXT: lgh %r0, 0(%r1) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: la %r2, 166(%r15) +; CHECK-NEXT: sth %r0, 166(%r15) +; CHECK-NEXT: brasl %r14, passCallee__Float16_2@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <1 x half>, align 2 + %0 = load <1 x half>, ptr @global__Float16_2, align 2 + store <1 x half> %0, ptr %byval-temp, align 2 + call void @passCallee__Float16_2(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee__Float16_2(ptr noundef dead_on_return) + +define void @loadAndPass__Float16_8() { +; CHECK-LABEL: loadAndPass__Float16_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global__Float16_8@GOT +; CHECK-NEXT: mvc 160(8,%r15), 0(%r1) +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, passCallee__Float16_8@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <4 x half>, align 8 + %0 = load <4 x half>, ptr @global__Float16_8, align 8 + store <4 x half> %0, ptr %byval-temp, align 8 + call void @passCallee__Float16_8(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee__Float16_8(ptr noundef dead_on_return) + +define void @loadAndPass__Float16_16() { +; CHECK-LABEL: loadAndPass__Float16_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r2, 168(%r1) +; CHECK-NEXT: nill %r2, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global__Float16_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee__Float16_16@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <8 x half>, align 16 + %0 = load <8 x half>, ptr @global__Float16_16, align 16 + store <8 x half> %0, ptr %byval-temp, align 16 + call void @passCallee__Float16_16(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee__Float16_16(ptr noundef dead_on_return) + +define void @loadAndPass__Float16_32() { +; CHECK-LABEL: loadAndPass__Float16_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r2, 184(%r1) +; CHECK-NEXT: nill %r2, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global__Float16_32@GOT +; CHECK-NEXT: mvc 24(8,%r2), 24(%r1) +; CHECK-NEXT: mvc 16(8,%r2), 16(%r1) +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: 
brasl %r14, passCallee__Float16_32@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <16 x half>, align 32 + %0 = load <16 x half>, ptr @global__Float16_32, align 32 + store <16 x half> %0, ptr %byval-temp, align 32 + call void @passCallee__Float16_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee__Float16_32(ptr noundef dead_on_return) + +define void @loadAndPass_float_4() { +; CHECK-LABEL: loadAndPass_float_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global_float_4@GOT +; CHECK-NEXT: mvc 164(4,%r15), 0(%r1) +; CHECK-NEXT: la %r2, 164(%r15) +; CHECK-NEXT: brasl %r14, passCallee_float_4@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <1 x float>, align 4 + %0 = load <1 x float>, ptr @global_float_4, align 4 + store <1 x float> %0, ptr %byval-temp, align 4 + call void @passCallee_float_4(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_float_4(ptr noundef dead_on_return) + +define void @loadAndPass_float_8() { +; CHECK-LABEL: loadAndPass_float_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global_float_8@GOT +; CHECK-NEXT: mvc 160(8,%r15), 0(%r1) +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, passCallee_float_8@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <2 x float>, align 8 + %0 = load <2 x float>, ptr @global_float_8, align 8 + store <2 x float> %0, ptr %byval-temp, align 8 + call void @passCallee_float_8(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_float_8(ptr noundef dead_on_return) + +define void @loadAndPass_float_16() { +; CHECK-LABEL: loadAndPass_float_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r2, 168(%r1) +; CHECK-NEXT: nill %r2, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_float_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_float_16@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <4 x float>, align 16 + %0 = load <4 x float>, ptr @global_float_16, align 16 + store <4 x float> %0, ptr %byval-temp, align 16 + call void @passCallee_float_16(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_float_16(ptr noundef dead_on_return) + +define void @loadAndPass_double_8() { +; CHECK-LABEL: loadAndPass_double_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: lgrl %r1, global_double_8@GOT +; CHECK-NEXT: 
mvc 160(8,%r15), 0(%r1) +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, passCallee_double_8@PLT +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <1 x double>, align 8 + %0 = load <1 x double>, ptr @global_double_8, align 8 + store <1 x double> %0, ptr %byval-temp, align 8 + call void @passCallee_double_8(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_double_8(ptr noundef dead_on_return) + +define void @loadAndPass_double_16() { +; CHECK-LABEL: loadAndPass_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r2, 168(%r1) +; CHECK-NEXT: nill %r2, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_double_16@GOT +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_double_16@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <2 x double>, align 16 + %0 = load <2 x double>, ptr @global_double_16, align 16 + store <2 x double> %0, ptr %byval-temp, align 16 + call void @passCallee_double_16(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_double_16(ptr noundef dead_on_return) + +define void @loadAndPass_double_32() { +; CHECK-LABEL: loadAndPass_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r2, 184(%r1) +; CHECK-NEXT: nill %r2, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_double_32@GOT +; CHECK-NEXT: mvc 24(8,%r2), 24(%r1) +; CHECK-NEXT: mvc 16(8,%r2), 16(%r1) +; CHECK-NEXT: mvc 8(8,%r2), 8(%r1) +; CHECK-NEXT: mvc 0(8,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_double_32@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <4 x double>, align 32 + %0 = load <4 x double>, ptr @global_double_32, align 32 + store <4 x double> %0, ptr %byval-temp, align 32 + call void @passCallee_double_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_double_32(ptr noundef dead_on_return) + +define void @loadAndPass_long_double_16() { +; CHECK-LABEL: loadAndPass_long_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r2, 168(%r1) +; CHECK-NEXT: nill %r2, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_long_double_16@GOT +; CHECK-NEXT: mvc 0(16,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_long_double_16@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: 
br %r14 +entry: + %byval-temp = alloca <1 x fp128>, align 16 + %0 = load <1 x fp128>, ptr @global_long_double_16, align 16 + store <1 x fp128> %0, ptr %byval-temp, align 16 + call void @passCallee_long_double_16(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_long_double_16(ptr noundef dead_on_return) + +define void @loadAndPass_long_double_32() { +; CHECK-LABEL: loadAndPass_long_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r2, 184(%r1) +; CHECK-NEXT: nill %r2, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgrl %r1, global_long_double_32@GOT +; CHECK-NEXT: mvc 16(16,%r2), 16(%r1) +; CHECK-NEXT: mvc 0(16,%r2), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_long_double_32@PLT +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <2 x fp128>, align 32 + %0 = load <2 x fp128>, ptr @global_long_double_32, align 32 + store <2 x fp128> %0, ptr %byval-temp, align 32 + call void @passCallee_long_double_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_long_double_32(ptr noundef dead_on_return) + +define void @receiveAndStore_char_1() { +; CHECK-LABEL: receiveAndStore_char_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 167(%r15) +; CHECK-NEXT: brasl %r14, retCallee_char_1@PLT +; CHECK-NEXT: lgrl %r1, global_char_1@GOT +; CHECK-NEXT: mvc 0(1,%r1), 167(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <1 x i8>, align 1 + call void @retCallee_char_1(ptr dead_on_unwind nonnull writable sret(<1 x i8>) align 1 %tmp) + %0 = load <1 x i8>, ptr %tmp, align 1 + store <1 x i8> %0, ptr @global_char_1, align 2 + ret void +} + +declare void @retCallee_char_1(ptr dead_on_unwind writable sret(<1 x i8>) align 1) + +define void @receiveAndStore_char_8() { +; CHECK-LABEL: receiveAndStore_char_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee_char_8@PLT +; CHECK-NEXT: lgrl %r1, global_char_8@GOT +; CHECK-NEXT: mvc 0(8,%r1), 160(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <8 x i8>, align 8 + call void @retCallee_char_8(ptr dead_on_unwind nonnull writable sret(<8 x i8>) align 8 %tmp) + %0 = load <8 x i8>, ptr %tmp, align 8 + store <8 x i8> %0, ptr @global_char_8, align 8 + ret void +} + +declare void @retCallee_char_8(ptr dead_on_unwind writable sret(<8 x i8>) align 8) + +define void @receiveAndStore_char_16() { +; CHECK-LABEL: receiveAndStore_char_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: 
.cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r13, 168(%r1) +; CHECK-NEXT: nill %r13, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_char_16@PLT +; CHECK-NEXT: lgrl %r1, global_char_16@GOT +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <16 x i8>, align 16 + call void @retCallee_char_16(ptr dead_on_unwind nonnull writable sret(<16 x i8>) align 16 %tmp) + %0 = load <16 x i8>, ptr %tmp, align 16 + store <16 x i8> %0, ptr @global_char_16, align 16 + ret void +} + +declare void @retCallee_char_16(ptr dead_on_unwind writable sret(<16 x i8>) align 16) + +define void @receiveAndStore_char_32() { +; CHECK-LABEL: receiveAndStore_char_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r13, 184(%r1) +; CHECK-NEXT: nill %r13, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_char_32@PLT +; CHECK-NEXT: lgrl %r1, global_char_32@GOT +; CHECK-NEXT: mvc 24(8,%r1), 24(%r13) +; CHECK-NEXT: mvc 16(8,%r1), 16(%r13) +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <32 x i8>, align 32 + call void @retCallee_char_32(ptr dead_on_unwind nonnull writable sret(<32 x i8>) align 32 %tmp) + %0 = load <32 x i8>, ptr %tmp, align 32 + store <32 x i8> %0, ptr @global_char_32, align 32 + ret void +} + +declare void @retCallee_char_32(ptr dead_on_unwind writable sret(<32 x i8>) align 32) + +define void @receiveAndStore_short_2() { +; CHECK-LABEL: receiveAndStore_short_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 166(%r15) +; CHECK-NEXT: brasl %r14, retCallee_short_2@PLT +; CHECK-NEXT: lgrl %r1, global_short_2@GOT +; CHECK-NEXT: mvc 0(2,%r1), 166(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <1 x i16>, align 2 + call void @retCallee_short_2(ptr dead_on_unwind nonnull writable sret(<1 x i16>) align 2 %tmp) + %0 = load <1 x i16>, ptr %tmp, align 2 + store <1 x i16> %0, ptr @global_short_2, align 2 + ret void +} + +declare void @retCallee_short_2(ptr dead_on_unwind writable sret(<1 x i16>) align 2) + +define void @receiveAndStore_short_8() { +; CHECK-LABEL: receiveAndStore_short_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee_short_8@PLT +; CHECK-NEXT: lgrl %r1, global_short_8@GOT +; CHECK-NEXT: mvc 0(8,%r1), 160(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <4 x i16>, align 8 + 
call void @retCallee_short_8(ptr dead_on_unwind nonnull writable sret(<4 x i16>) align 8 %tmp) + %0 = load <4 x i16>, ptr %tmp, align 8 + store <4 x i16> %0, ptr @global_short_8, align 8 + ret void +} + +declare void @retCallee_short_8(ptr dead_on_unwind writable sret(<4 x i16>) align 8) + +define void @receiveAndStore_short_16() { +; CHECK-LABEL: receiveAndStore_short_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r13, 168(%r1) +; CHECK-NEXT: nill %r13, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_short_16@PLT +; CHECK-NEXT: lgrl %r1, global_short_16@GOT +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <8 x i16>, align 16 + call void @retCallee_short_16(ptr dead_on_unwind nonnull writable sret(<8 x i16>) align 16 %tmp) + %0 = load <8 x i16>, ptr %tmp, align 16 + store <8 x i16> %0, ptr @global_short_16, align 16 + ret void +} + +declare void @retCallee_short_16(ptr dead_on_unwind writable sret(<8 x i16>) align 16) + +define void @receiveAndStore_int_4() { +; CHECK-LABEL: receiveAndStore_int_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 164(%r15) +; CHECK-NEXT: brasl %r14, retCallee_int_4@PLT +; CHECK-NEXT: lgrl %r1, global_int_4@GOT +; CHECK-NEXT: mvc 0(4,%r1), 164(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <1 x i32>, align 4 + call void @retCallee_int_4(ptr dead_on_unwind nonnull writable sret(<1 x i32>) align 4 %tmp) + %0 = load <1 x i32>, ptr %tmp, align 4 + store <1 x i32> %0, ptr @global_int_4, align 4 + ret void +} + +declare void @retCallee_int_4(ptr dead_on_unwind writable sret(<1 x i32>) align 4) + +define void @receiveAndStore_int_8() { +; CHECK-LABEL: receiveAndStore_int_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee_int_8@PLT +; CHECK-NEXT: lgrl %r1, global_int_8@GOT +; CHECK-NEXT: mvc 0(8,%r1), 160(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <2 x i32>, align 8 + call void @retCallee_int_8(ptr dead_on_unwind nonnull writable sret(<2 x i32>) align 8 %tmp) + %0 = load <2 x i32>, ptr %tmp, align 8 + store <2 x i32> %0, ptr @global_int_8, align 8 + ret void +} + +declare void @retCallee_int_8(ptr dead_on_unwind writable sret(<2 x i32>) align 8) + +define void @receiveAndStore_int_16() { +; CHECK-LABEL: receiveAndStore_int_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 
320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r13, 168(%r1) +; CHECK-NEXT: nill %r13, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_int_16@PLT +; CHECK-NEXT: lgrl %r1, global_int_16@GOT +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <4 x i32>, align 16 + call void @retCallee_int_16(ptr dead_on_unwind nonnull writable sret(<4 x i32>) align 16 %tmp) + %0 = load <4 x i32>, ptr %tmp, align 16 + store <4 x i32> %0, ptr @global_int_16, align 16 + ret void +} + +declare void @retCallee_int_16(ptr dead_on_unwind writable sret(<4 x i32>) align 16) + +define void @receiveAndStore_int_32() { +; CHECK-LABEL: receiveAndStore_int_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r13, 184(%r1) +; CHECK-NEXT: nill %r13, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_int_32@PLT +; CHECK-NEXT: lgrl %r1, global_int_32@GOT +; CHECK-NEXT: mvc 24(8,%r1), 24(%r13) +; CHECK-NEXT: mvc 16(8,%r1), 16(%r13) +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <8 x i32>, align 32 + call void @retCallee_int_32(ptr dead_on_unwind nonnull writable sret(<8 x i32>) align 32 %tmp) + %0 = load <8 x i32>, ptr %tmp, align 32 + store <8 x i32> %0, ptr @global_int_32, align 32 + ret void +} + +declare void @retCallee_int_32(ptr dead_on_unwind writable sret(<8 x i32>) align 32) + +define void @receiveAndStore_long_8() { +; CHECK-LABEL: receiveAndStore_long_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee_long_8@PLT +; CHECK-NEXT: lgrl %r1, global_long_8@GOT +; CHECK-NEXT: mvc 0(8,%r1), 160(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <1 x i64>, align 8 + call void @retCallee_long_8(ptr dead_on_unwind nonnull writable sret(<1 x i64>) align 8 %tmp) + %0 = load <1 x i64>, ptr %tmp, align 8 + store <1 x i64> %0, ptr @global_long_8, align 8 + ret void +} + +declare void @retCallee_long_8(ptr dead_on_unwind writable sret(<1 x i64>) align 8) + +define void @receiveAndStore_long_16() { +; CHECK-LABEL: receiveAndStore_long_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r13, 168(%r1) +; CHECK-NEXT: nill %r13, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: 
lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_long_16@PLT +; CHECK-NEXT: lgrl %r1, global_long_16@GOT +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <2 x i64>, align 16 + call void @retCallee_long_16(ptr dead_on_unwind nonnull writable sret(<2 x i64>) align 16 %tmp) + %0 = load <2 x i64>, ptr %tmp, align 16 + store <2 x i64> %0, ptr @global_long_16, align 16 + ret void +} + +declare void @retCallee_long_16(ptr dead_on_unwind writable sret(<2 x i64>) align 16) + +define void @receiveAndStore___int128_16() { +; CHECK-LABEL: receiveAndStore___int128_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r13, 168(%r1) +; CHECK-NEXT: nill %r13, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee___int128_16@PLT +; CHECK-NEXT: lgrl %r1, global___int128_16@GOT +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <1 x i128>, align 16 + call void @retCallee___int128_16(ptr dead_on_unwind nonnull writable sret(<1 x i128>) align 16 %tmp) + %0 = load <1 x i128>, ptr %tmp, align 16 + store <1 x i128> %0, ptr @global___int128_16, align 16 + ret void +} + +declare void @retCallee___int128_16(ptr dead_on_unwind writable sret(<1 x i128>) align 16) + +define void @receiveAndStore___int128_32() { +; CHECK-LABEL: receiveAndStore___int128_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r13, 184(%r1) +; CHECK-NEXT: nill %r13, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee___int128_32@PLT +; CHECK-NEXT: lgrl %r1, global___int128_32@GOT +; CHECK-NEXT: mvc 24(8,%r1), 24(%r13) +; CHECK-NEXT: mvc 16(8,%r1), 16(%r13) +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <2 x i128>, align 32 + call void @retCallee___int128_32(ptr dead_on_unwind nonnull writable sret(<2 x i128>) align 32 %tmp) + %0 = load <2 x i128>, ptr %tmp, align 32 + store <2 x i128> %0, ptr @global___int128_32, align 32 + ret void +} + +declare void @retCallee___int128_32(ptr dead_on_unwind writable sret(<2 x i128>) align 32) + +define void @receiveAndStore__Float16_2() { +; CHECK-LABEL: receiveAndStore__Float16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 166(%r15) +; CHECK-NEXT: brasl %r14, retCallee__Float16_2@PLT +; CHECK-NEXT: lgh %r0, 166(%r15) +; 
CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgrl %r1, global__Float16_2@GOT +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r1) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <1 x half>, align 2 + call void @retCallee__Float16_2(ptr dead_on_unwind nonnull writable sret(<1 x half>) align 2 %tmp) + %0 = load <1 x half>, ptr %tmp, align 2 + store <1 x half> %0, ptr @global__Float16_2, align 2 + ret void +} + +declare void @retCallee__Float16_2(ptr dead_on_unwind writable sret(<1 x half>) align 2) + +define void @receiveAndStore__Float16_8() { +; CHECK-LABEL: receiveAndStore__Float16_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee__Float16_8@PLT +; CHECK-NEXT: lgrl %r1, global__Float16_8@GOT +; CHECK-NEXT: mvc 0(8,%r1), 160(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <4 x half>, align 8 + call void @retCallee__Float16_8(ptr dead_on_unwind nonnull writable sret(<4 x half>) align 8 %tmp) + %0 = load <4 x half>, ptr %tmp, align 8 + store <4 x half> %0, ptr @global__Float16_8, align 8 + ret void +} + +declare void @retCallee__Float16_8(ptr dead_on_unwind writable sret(<4 x half>) align 8) + +define void @receiveAndStore__Float16_16() { +; CHECK-LABEL: receiveAndStore__Float16_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r13, 168(%r1) +; CHECK-NEXT: nill %r13, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee__Float16_16@PLT +; CHECK-NEXT: lgrl %r1, global__Float16_16@GOT +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <8 x half>, align 16 + call void @retCallee__Float16_16(ptr dead_on_unwind nonnull writable sret(<8 x half>) align 16 %tmp) + %0 = load <8 x half>, ptr %tmp, align 16 + store <8 x half> %0, ptr @global__Float16_16, align 16 + ret void +} + +declare void @retCallee__Float16_16(ptr dead_on_unwind writable sret(<8 x half>) align 16) + +define void @receiveAndStore__Float16_32() { +; CHECK-LABEL: receiveAndStore__Float16_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r13, 184(%r1) +; CHECK-NEXT: nill %r13, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee__Float16_32@PLT +; CHECK-NEXT: lgrl %r1, global__Float16_32@GOT +; CHECK-NEXT: mvc 24(8,%r1), 24(%r13) +; CHECK-NEXT: mvc 16(8,%r1), 16(%r13) +; CHECK-NEXT: 
mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <16 x half>, align 32 + call void @retCallee__Float16_32(ptr dead_on_unwind nonnull writable sret(<16 x half>) align 32 %tmp) + %0 = load <16 x half>, ptr %tmp, align 32 + store <16 x half> %0, ptr @global__Float16_32, align 32 + ret void +} + +declare void @retCallee__Float16_32(ptr dead_on_unwind writable sret(<16 x half>) align 32) + +define void @receiveAndStore_float_4() { +; CHECK-LABEL: receiveAndStore_float_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 164(%r15) +; CHECK-NEXT: brasl %r14, retCallee_float_4@PLT +; CHECK-NEXT: lgrl %r1, global_float_4@GOT +; CHECK-NEXT: mvc 0(4,%r1), 164(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <1 x float>, align 4 + call void @retCallee_float_4(ptr dead_on_unwind nonnull writable sret(<1 x float>) align 4 %tmp) + %0 = load <1 x float>, ptr %tmp, align 4 + store <1 x float> %0, ptr @global_float_4, align 4 + ret void +} + +declare void @retCallee_float_4(ptr dead_on_unwind writable sret(<1 x float>) align 4) + +define void @receiveAndStore_float_8() { +; CHECK-LABEL: receiveAndStore_float_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee_float_8@PLT +; CHECK-NEXT: lgrl %r1, global_float_8@GOT +; CHECK-NEXT: mvc 0(8,%r1), 160(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <2 x float>, align 8 + call void @retCallee_float_8(ptr dead_on_unwind nonnull writable sret(<2 x float>) align 8 %tmp) + %0 = load <2 x float>, ptr %tmp, align 8 + store <2 x float> %0, ptr @global_float_8, align 8 + ret void +} + +declare void @retCallee_float_8(ptr dead_on_unwind writable sret(<2 x float>) align 8) + +define void @receiveAndStore_float_16() { +; CHECK-LABEL: receiveAndStore_float_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r13, 168(%r1) +; CHECK-NEXT: nill %r13, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_float_16@PLT +; CHECK-NEXT: lgrl %r1, global_float_16@GOT +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <4 x float>, align 16 + call void @retCallee_float_16(ptr dead_on_unwind nonnull writable sret(<4 x float>) align 16 %tmp) + %0 = load <4 x float>, ptr %tmp, align 16 + store <4 x float> %0, ptr @global_float_16, align 16 + ret void +} + +declare void @retCallee_float_16(ptr dead_on_unwind writable sret(<4 x float>) align 16) + +define void @receiveAndStore_double_8() { +; CHECK-LABEL: receiveAndStore_double_8: +; CHECK: # %bb.0: # %entry 
+; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -168 +; CHECK-NEXT: .cfi_def_cfa_offset 328 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee_double_8@PLT +; CHECK-NEXT: lgrl %r1, global_double_8@GOT +; CHECK-NEXT: mvc 0(8,%r1), 160(%r15) +; CHECK-NEXT: lmg %r14, %r15, 280(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <1 x double>, align 8 + call void @retCallee_double_8(ptr dead_on_unwind nonnull writable sret(<1 x double>) align 8 %tmp) + %0 = load <1 x double>, ptr %tmp, align 8 + store <1 x double> %0, ptr @global_double_8, align 8 + ret void +} + +declare void @retCallee_double_8(ptr dead_on_unwind writable sret(<1 x double>) align 8) + +define void @receiveAndStore_double_16() { +; CHECK-LABEL: receiveAndStore_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r13, 168(%r1) +; CHECK-NEXT: nill %r13, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_double_16@PLT +; CHECK-NEXT: lgrl %r1, global_double_16@GOT +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <2 x double>, align 16 + call void @retCallee_double_16(ptr dead_on_unwind nonnull writable sret(<2 x double>) align 16 %tmp) + %0 = load <2 x double>, ptr %tmp, align 16 + store <2 x double> %0, ptr @global_double_16, align 16 + ret void +} + +declare void @retCallee_double_16(ptr dead_on_unwind writable sret(<2 x double>) align 16) + +define void @receiveAndStore_double_32() { +; CHECK-LABEL: receiveAndStore_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r13, 184(%r1) +; CHECK-NEXT: nill %r13, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_double_32@PLT +; CHECK-NEXT: lgrl %r1, global_double_32@GOT +; CHECK-NEXT: mvc 24(8,%r1), 24(%r13) +; CHECK-NEXT: mvc 16(8,%r1), 16(%r13) +; CHECK-NEXT: mvc 8(8,%r1), 8(%r13) +; CHECK-NEXT: mvc 0(8,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <4 x double>, align 32 + call void @retCallee_double_32(ptr dead_on_unwind nonnull writable sret(<4 x double>) align 32 %tmp) + %0 = load <4 x double>, ptr %tmp, align 32 + store <4 x double> %0, ptr @global_double_32, align 32 + ret void +} + +declare void @retCallee_double_32(ptr dead_on_unwind writable sret(<4 x double>) align 32) + +define void @receiveAndStore_long_double_16() { +; CHECK-LABEL: receiveAndStore_long_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: 
.cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -24 +; CHECK-NEXT: la %r13, 168(%r1) +; CHECK-NEXT: nill %r13, 65520 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_long_double_16@PLT +; CHECK-NEXT: lgrl %r1, global_long_double_16@GOT +; CHECK-NEXT: mvc 0(16,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <1 x fp128>, align 16 + call void @retCallee_long_double_16(ptr dead_on_unwind nonnull writable sret(<1 x fp128>) align 16 %tmp) + %0 = load <1 x fp128>, ptr %tmp, align 16 + store <1 x fp128> %0, ptr @global_long_double_16, align 16 + ret void +} + +declare void @retCallee_long_double_16(ptr dead_on_unwind writable sret(<1 x fp128>) align 16) + +define void @receiveAndStore_long_double_32() { +; CHECK-LABEL: receiveAndStore_long_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: lgr %r1, %r15 +; CHECK-NEXT: aghi %r1, -56 +; CHECK-NEXT: la %r13, 184(%r1) +; CHECK-NEXT: nill %r13, 65504 +; CHECK-NEXT: lgr %r15, %r1 +; CHECK-NEXT: lgr %r2, %r13 +; CHECK-NEXT: brasl %r14, retCallee_long_double_32@PLT +; CHECK-NEXT: lgrl %r1, global_long_double_32@GOT +; CHECK-NEXT: mvc 16(16,%r1), 16(%r13) +; CHECK-NEXT: mvc 0(16,%r1), 0(%r13) +; CHECK-NEXT: lmg %r11, %r15, 248(%r11) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <2 x fp128>, align 32 + call void @retCallee_long_double_32(ptr dead_on_unwind nonnull writable sret(<2 x fp128>) align 32 %tmp) + %0 = load <2 x fp128>, ptr %tmp, align 32 + store <2 x fp128> %0, ptr @global_long_double_32, align 32 + ret void +} + +declare void @retCallee_long_double_32(ptr dead_on_unwind writable sret(<2 x fp128>) align 32) diff --git a/llvm/test/CodeGen/SystemZ/vec-abi-02.ll b/llvm/test/CodeGen/SystemZ/vec-abi-02.ll new file mode 100644 index 0000000000000..c57a9d6b3378a --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-abi-02.ll @@ -0,0 +1,1751 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s +; +; Test passing vector arguments per the ABI for z16 (with vector support). 
+; The function names codify the element type and the size of the vector in +; bytes, just like in the clang test systemz-abi-vector.c + +@global_char_1 = global <1 x i8> zeroinitializer, align 2 +@global_char_8 = global <8 x i8> zeroinitializer, align 8 +@global_char_16 = global <16 x i8> zeroinitializer, align 8 +@global_char_32 = global <32 x i8> zeroinitializer, align 8 +@global_short_2 = global <1 x i16> zeroinitializer, align 2 +@global_short_8 = global <4 x i16> zeroinitializer, align 8 +@global_short_16 = global <8 x i16> zeroinitializer, align 8 +@global_int_4 = global <1 x i32> zeroinitializer, align 4 +@global_int_8 = global <2 x i32> zeroinitializer, align 8 +@global_int_16 = global <4 x i32> zeroinitializer, align 8 +@global_int_32 = global <8 x i32> zeroinitializer, align 8 +@global_long_8 = global <1 x i64> zeroinitializer, align 8 +@global_long_16 = global <2 x i64> zeroinitializer, align 8 +@global___int128_16 = global <1 x i128> zeroinitializer, align 8 +@global___int128_32 = global <2 x i128> zeroinitializer, align 8 +@global__Float16_2 = global <1 x half> zeroinitializer, align 2 +@global__Float16_8 = global <4 x half> zeroinitializer, align 8 +@global__Float16_16 = global <8 x half> zeroinitializer, align 8 +@global__Float16_32 = global <16 x half> zeroinitializer, align 8 +@global_float_4 = global <1 x float> zeroinitializer, align 4 +@global_float_8 = global <2 x float> zeroinitializer, align 8 +@global_float_16 = global <4 x float> zeroinitializer, align 8 +@global_double_8 = global <1 x double> zeroinitializer, align 8 +@global_double_16 = global <2 x double> zeroinitializer, align 8 +@global_double_32 = global <4 x double> zeroinitializer, align 8 +@global_long_double_16 = global <1 x fp128> zeroinitializer, align 8 +@global_long_double_32 = global <2 x fp128> zeroinitializer, align 8 + +define void @takeAndStore_char_1(<1 x i8> noundef %x) { +; CHECK-LABEL: takeAndStore_char_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_1@GOT +; CHECK-NEXT: vsteb %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <1 x i8> %x, ptr @global_char_1, align 2 + ret void +} + +define void @takeAndStore_char_8(<8 x i8> noundef %x) { +; CHECK-LABEL: takeAndStore_char_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <8 x i8> %x, ptr @global_char_8, align 8 + ret void +} + +define void @takeAndStore_char_16(<16 x i8> noundef %x) { +; CHECK-LABEL: takeAndStore_char_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + store <16 x i8> %x, ptr @global_char_16, align 8 + ret void +} + +define void @takeAndStore_char_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_char_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: lgrl %r1, global_char_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %x = load <32 x i8>, ptr %0, align 8 + store <32 x i8> %x, ptr @global_char_32, align 8 + ret void +} + +define void @takeAndStore_short_2(<1 x i16> noundef %x) { +; CHECK-LABEL: takeAndStore_short_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_2@GOT +; CHECK-NEXT: vsteh %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <1 x i16> %x, ptr @global_short_2, align 2 + ret void +} + +define void 
@takeAndStore_short_8(<4 x i16> noundef %x) { +; CHECK-LABEL: takeAndStore_short_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <4 x i16> %x, ptr @global_short_8, align 8 + ret void +} + +define void @takeAndStore_short_16(<8 x i16> noundef %x) { +; CHECK-LABEL: takeAndStore_short_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + store <8 x i16> %x, ptr @global_short_16, align 8 + ret void +} + +define void @takeAndStore_int_4(<1 x i32> noundef %x) { +; CHECK-LABEL: takeAndStore_int_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_4@GOT +; CHECK-NEXT: vstef %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <1 x i32> %x, ptr @global_int_4, align 4 + ret void +} + +define void @takeAndStore_int_8(<2 x i32> noundef %x) { +; CHECK-LABEL: takeAndStore_int_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <2 x i32> %x, ptr @global_int_8, align 8 + ret void +} + +define void @takeAndStore_int_16(<4 x i32> noundef %x) { +; CHECK-LABEL: takeAndStore_int_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + store <4 x i32> %x, ptr @global_int_16, align 8 + ret void +} + +define void @takeAndStore_int_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_int_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: lgrl %r1, global_int_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %x = load <8 x i32>, ptr %0, align 8 + store <8 x i32> %x, ptr @global_int_32, align 8 + ret void +} + +define void @takeAndStore_long_8(<1 x i64> noundef %x) { +; CHECK-LABEL: takeAndStore_long_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <1 x i64> %x, ptr @global_long_8, align 8 + ret void +} + +define void @takeAndStore_long_16(<2 x i64> noundef %x) { +; CHECK-LABEL: takeAndStore_long_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + store <2 x i64> %x, ptr @global_long_16, align 8 + ret void +} + +define void @takeAndStore___int128_16(<1 x i128> noundef %x) { +; CHECK-LABEL: takeAndStore___int128_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global___int128_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + store <1 x i128> %x, ptr @global___int128_16, align 8 + ret void +} + +define void @takeAndStore___int128_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore___int128_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: lgrl %r1, global___int128_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %x = load <2 x i128>, ptr %0, align 8 + store <2 x i128> %x, ptr @global___int128_32, align 8 + ret void +} + +define void @takeAndStore__Float16_2(<1 x half> noundef %x) { +; CHECK-LABEL: takeAndStore__Float16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_2@GOT +; 
CHECK-NEXT: vsteh %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <1 x half> %x, ptr @global__Float16_2, align 2 + ret void +} + +define void @takeAndStore__Float16_8(<4 x half> noundef %x) { +; CHECK-LABEL: takeAndStore__Float16_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_8@GOT +; CHECK-NEXT: vsteh %v24, 6(%r1), 3 +; CHECK-NEXT: vsteh %v24, 4(%r1), 2 +; CHECK-NEXT: vsteh %v24, 2(%r1), 1 +; CHECK-NEXT: vsteh %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <4 x half> %x, ptr @global__Float16_8, align 8 + ret void +} + +define void @takeAndStore__Float16_16(<8 x half> noundef %x) { +; CHECK-LABEL: takeAndStore__Float16_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + store <8 x half> %x, ptr @global__Float16_16, align 8 + ret void +} + +define void @takeAndStore__Float16_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore__Float16_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: lgrl %r1, global__Float16_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %x = load <16 x half>, ptr %0, align 8 + store <16 x half> %x, ptr @global__Float16_32, align 8 + ret void +} + +define void @takeAndStore_float_4(<1 x float> noundef %x) { +; CHECK-LABEL: takeAndStore_float_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_4@GOT +; CHECK-NEXT: vstef %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <1 x float> %x, ptr @global_float_4, align 4 + ret void +} + +define void @takeAndStore_float_8(<2 x float> noundef %x) { +; CHECK-LABEL: takeAndStore_float_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <2 x float> %x, ptr @global_float_8, align 8 + ret void +} + +define void @takeAndStore_float_16(<4 x float> noundef %x) { +; CHECK-LABEL: takeAndStore_float_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + store <4 x float> %x, ptr @global_float_16, align 8 + ret void +} + +define void @takeAndStore_double_8(<1 x double> noundef %x) { +; CHECK-LABEL: takeAndStore_double_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_double_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: br %r14 +entry: + store <1 x double> %x, ptr @global_double_8, align 8 + ret void +} + +define void @takeAndStore_double_16(<2 x double> noundef %x) { +; CHECK-LABEL: takeAndStore_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_double_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + store <2 x double> %x, ptr @global_double_16, align 8 + ret void +} + +define void @takeAndStore_double_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: lgrl %r1, global_double_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %x = load <4 x double>, ptr %0, align 8 + store <4 x double> %x, ptr @global_double_32, align 8 + ret void +} + +define void @takeAndStore_long_double_16(<1 x fp128> noundef %x) { +; CHECK-LABEL: takeAndStore_long_double_16: +; CHECK: # 
%bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_double_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + store <1 x fp128> %x, ptr @global_long_double_16, align 8 + ret void +} + +define void @takeAndStore_long_double_32(ptr noundef readonly captures(none) dead_on_return %0) { +; CHECK-LABEL: takeAndStore_long_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vl %v1, 16(%r2), 3 +; CHECK-NEXT: lgrl %r1, global_long_double_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %x = load <2 x fp128>, ptr %0, align 8 + store <2 x fp128> %x, ptr @global_long_double_32, align 8 + ret void +} + +define <1 x i8> @loadAndReturn_char_1() { +; CHECK-LABEL: loadAndReturn_char_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_1@GOT +; CHECK-NEXT: vlrepb %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x i8>, ptr @global_char_1, align 2 + ret <1 x i8> %0 +} + +define <8 x i8> @loadAndReturn_char_8() { +; CHECK-LABEL: loadAndReturn_char_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <8 x i8>, ptr @global_char_8, align 8 + ret <8 x i8> %0 +} + +define <16 x i8> @loadAndReturn_char_16() { +; CHECK-LABEL: loadAndReturn_char_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <16 x i8>, ptr @global_char_16, align 8 + ret <16 x i8> %0 +} + +define void @loadAndReturn_char_32(ptr dead_on_unwind noalias writable writeonly sret(<32 x i8>) align 8 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn_char_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_32@GOT +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vl %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v1, 16(%r2), 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <32 x i8>, ptr @global_char_32, align 8 + store <32 x i8> %0, ptr %agg.result, align 8 + ret void +} + +define <1 x i16> @loadAndReturn_short_2() { +; CHECK-LABEL: loadAndReturn_short_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_2@GOT +; CHECK-NEXT: vlreph %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x i16>, ptr @global_short_2, align 2 + ret <1 x i16> %0 +} + +define <4 x i16> @loadAndReturn_short_8() { +; CHECK-LABEL: loadAndReturn_short_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x i16>, ptr @global_short_8, align 8 + ret <4 x i16> %0 +} + +define <8 x i16> @loadAndReturn_short_16() { +; CHECK-LABEL: loadAndReturn_short_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <8 x i16>, ptr @global_short_16, align 8 + ret <8 x i16> %0 +} + +define <1 x i32> @loadAndReturn_int_4() { +; CHECK-LABEL: loadAndReturn_int_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_4@GOT +; CHECK-NEXT: vlrepf %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x i32>, ptr @global_int_4, align 4 + ret <1 x i32> %0 +} + +define <2 x i32> @loadAndReturn_int_8() { +; CHECK-LABEL: loadAndReturn_int_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = 
load <2 x i32>, ptr @global_int_8, align 8 + ret <2 x i32> %0 +} + +define <4 x i32> @loadAndReturn_int_16() { +; CHECK-LABEL: loadAndReturn_int_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x i32>, ptr @global_int_16, align 8 + ret <4 x i32> %0 +} + +define void @loadAndReturn_int_32(ptr dead_on_unwind noalias writable writeonly sret(<8 x i32>) align 8 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn_int_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_32@GOT +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vl %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v1, 16(%r2), 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <8 x i32>, ptr @global_int_32, align 8 + store <8 x i32> %0, ptr %agg.result, align 8 + ret void +} + +define <1 x i64> @loadAndReturn_long_8() { +; CHECK-LABEL: loadAndReturn_long_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x i64>, ptr @global_long_8, align 8 + ret <1 x i64> %0 +} + +define <2 x i64> @loadAndReturn_long_16() { +; CHECK-LABEL: loadAndReturn_long_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x i64>, ptr @global_long_16, align 8 + ret <2 x i64> %0 +} + +define <1 x i128> @loadAndReturn___int128_16() { +; CHECK-LABEL: loadAndReturn___int128_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global___int128_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x i128>, ptr @global___int128_16, align 8 + ret <1 x i128> %0 +} + +define void @loadAndReturn___int128_32(ptr dead_on_unwind noalias writable writeonly sret(<2 x i128>) align 8 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn___int128_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global___int128_32@GOT +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vl %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v1, 16(%r2), 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x i128>, ptr @global___int128_32, align 8 + store <2 x i128> %0, ptr %agg.result, align 8 + ret void +} + +define <1 x half> @loadAndReturn__Float16_2() { +; CHECK-LABEL: loadAndReturn__Float16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_2@GOT +; CHECK-NEXT: vlreph %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x half>, ptr @global__Float16_2, align 2 + ret <1 x half> %0 +} + +define <4 x half> @loadAndReturn__Float16_8() { +; CHECK-LABEL: loadAndReturn__Float16_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_8@GOT +; CHECK-NEXT: vlreph %v0, 0(%r1) +; CHECK-NEXT: vlreph %v1, 2(%r1) +; CHECK-NEXT: vlreph %v2, 4(%r1) +; CHECK-NEXT: vlreph %v3, 6(%r1) +; CHECK-NEXT: vmrhh %v2, %v2, %v3 +; CHECK-NEXT: vmrhh %v0, %v0, %v1 +; CHECK-NEXT: vmrhf %v0, %v0, %v2 +; CHECK-NEXT: vmrhg %v24, %v0, %v0 +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x half>, ptr @global__Float16_8, align 8 + ret <4 x half> %0 +} + +define <8 x half> @loadAndReturn__Float16_16() { +; CHECK-LABEL: loadAndReturn__Float16_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <8 x half>, ptr @global__Float16_16, align 8 + ret <8 x 
half> %0 +} + +define void @loadAndReturn__Float16_32(ptr dead_on_unwind noalias writable writeonly sret(<16 x half>) align 8 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn__Float16_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_32@GOT +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vl %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v1, 16(%r2), 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <16 x half>, ptr @global__Float16_32, align 8 + store <16 x half> %0, ptr %agg.result, align 8 + ret void +} + +define <1 x float> @loadAndReturn_float_4() { +; CHECK-LABEL: loadAndReturn_float_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_4@GOT +; CHECK-NEXT: vlrepf %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x float>, ptr @global_float_4, align 4 + ret <1 x float> %0 +} + +define <2 x float> @loadAndReturn_float_8() { +; CHECK-LABEL: loadAndReturn_float_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x float>, ptr @global_float_8, align 8 + ret <2 x float> %0 +} + +define <4 x float> @loadAndReturn_float_16() { +; CHECK-LABEL: loadAndReturn_float_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x float>, ptr @global_float_16, align 8 + ret <4 x float> %0 +} + +define <1 x double> @loadAndReturn_double_8() { +; CHECK-LABEL: loadAndReturn_double_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_double_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x double>, ptr @global_double_8, align 8 + ret <1 x double> %0 +} + +define <2 x double> @loadAndReturn_double_16() { +; CHECK-LABEL: loadAndReturn_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_double_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x double>, ptr @global_double_16, align 8 + ret <2 x double> %0 +} + +define void @loadAndReturn_double_32(ptr dead_on_unwind noalias writable writeonly sret(<4 x double>) align 8 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_double_32@GOT +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vl %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v1, 16(%r2), 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <4 x double>, ptr @global_double_32, align 8 + store <4 x double> %0, ptr %agg.result, align 8 + ret void +} + +define <1 x fp128> @loadAndReturn_long_double_16() { +; CHECK-LABEL: loadAndReturn_long_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_double_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: br %r14 +entry: + %0 = load <1 x fp128>, ptr @global_long_double_16, align 8 + ret <1 x fp128> %0 +} + +define void @loadAndReturn_long_double_32(ptr dead_on_unwind noalias writable writeonly sret(<2 x fp128>) align 8 captures(none) initializes((0, 32)) %agg.result) { +; CHECK-LABEL: loadAndReturn_long_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_double_32@GOT +; CHECK-NEXT: mvc 16(16,%r2), 16(%r1) +; CHECK-NEXT: mvc 0(16,%r2), 0(%r1) +; CHECK-NEXT: br %r14 +entry: + %0 = load <2 x fp128>, ptr @global_long_double_32, align 8 + store <2 x fp128> %0, ptr %agg.result, align 8 + ret void +} + 
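+; The loadAndPass_* functions below exercise the caller side of the ABI. As
+; the CHECK lines show, vector values of up to 16 bytes are passed directly
+; in %v24 (permitting a sibling call via jg), while 32-byte vectors are first
+; spilled to a stack temporary and passed indirectly through a pointer
+; marked dead_on_return.
+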
+define void @loadAndPass_char_1() { +; CHECK-LABEL: loadAndPass_char_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_1@GOT +; CHECK-NEXT: vlrepb %v24, 0(%r1) +; CHECK-NEXT: jg passCallee_char_1@PLT +entry: + %0 = load <1 x i8>, ptr @global_char_1, align 2 + tail call void @passCallee_char_1(<1 x i8> noundef %0) + ret void +} + +declare void @passCallee_char_1(<1 x i8> noundef) + +define void @loadAndPass_char_8() { +; CHECK-LABEL: loadAndPass_char_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: jg passCallee_char_8@PLT +entry: + %0 = load <8 x i8>, ptr @global_char_8, align 8 + tail call void @passCallee_char_8(<8 x i8> noundef %0) + ret void +} + +declare void @passCallee_char_8(<8 x i8> noundef) + +define void @loadAndPass_char_16() { +; CHECK-LABEL: loadAndPass_char_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_char_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: jg passCallee_char_16@PLT +entry: + %0 = load <16 x i8>, ptr @global_char_16, align 8 + tail call void @passCallee_char_16(<16 x i8> noundef %0) + ret void +} + +declare void @passCallee_char_16(<16 x i8> noundef) + +define void @loadAndPass_char_32() { +; CHECK-LABEL: loadAndPass_char_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: lgrl %r1, global_char_32@GOT +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vl %v1, 16(%r1), 3 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: vst %v1, 176(%r15), 3 +; CHECK-NEXT: vst %v0, 160(%r15), 3 +; CHECK-NEXT: brasl %r14, passCallee_char_32@PLT +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <32 x i8>, align 8 + %0 = load <32 x i8>, ptr @global_char_32, align 8 + store <32 x i8> %0, ptr %byval-temp, align 8 + call void @passCallee_char_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_char_32(ptr noundef dead_on_return) + + + +define void @loadAndPass_short_2() { +; CHECK-LABEL: loadAndPass_short_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_2@GOT +; CHECK-NEXT: vlreph %v24, 0(%r1) +; CHECK-NEXT: jg passCallee_short_2@PLT +entry: + %0 = load <1 x i16>, ptr @global_short_2, align 2 + tail call void @passCallee_short_2(<1 x i16> noundef %0) + ret void +} + +declare void @passCallee_short_2(<1 x i16> noundef) + +define void @loadAndPass_short_8() { +; CHECK-LABEL: loadAndPass_short_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: jg passCallee_short_8@PLT +entry: + %0 = load <4 x i16>, ptr @global_short_8, align 8 + tail call void @passCallee_short_8(<4 x i16> noundef %0) + ret void +} + +declare void @passCallee_short_8(<4 x i16> noundef) + +define void @loadAndPass_short_16() { +; CHECK-LABEL: loadAndPass_short_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_short_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: jg passCallee_short_16@PLT +entry: + %0 = load <8 x i16>, ptr @global_short_16, align 8 + tail call void @passCallee_short_16(<8 x i16> noundef %0) + ret void +} + +declare void @passCallee_short_16(<8 x i16> noundef) + +define void @loadAndPass_int_4() { +; CHECK-LABEL: loadAndPass_int_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_4@GOT +; CHECK-NEXT: 
vlrepf %v24, 0(%r1) +; CHECK-NEXT: jg passCallee_int_4@PLT +entry: + %0 = load <1 x i32>, ptr @global_int_4, align 4 + tail call void @passCallee_int_4(<1 x i32> noundef %0) + ret void +} + +declare void @passCallee_int_4(<1 x i32> noundef) + +define void @loadAndPass_int_8() { +; CHECK-LABEL: loadAndPass_int_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: jg passCallee_int_8@PLT +entry: + %0 = load <2 x i32>, ptr @global_int_8, align 8 + tail call void @passCallee_int_8(<2 x i32> noundef %0) + ret void +} + +declare void @passCallee_int_8(<2 x i32> noundef) + +define void @loadAndPass_int_16() { +; CHECK-LABEL: loadAndPass_int_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_int_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: jg passCallee_int_16@PLT +entry: + %0 = load <4 x i32>, ptr @global_int_16, align 8 + tail call void @passCallee_int_16(<4 x i32> noundef %0) + ret void +} + +declare void @passCallee_int_16(<4 x i32> noundef) + +define void @loadAndPass_int_32() { +; CHECK-LABEL: loadAndPass_int_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: lgrl %r1, global_int_32@GOT +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vl %v1, 16(%r1), 3 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: vst %v1, 176(%r15), 3 +; CHECK-NEXT: vst %v0, 160(%r15), 3 +; CHECK-NEXT: brasl %r14, passCallee_int_32@PLT +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <8 x i32>, align 8 + %0 = load <8 x i32>, ptr @global_int_32, align 8 + store <8 x i32> %0, ptr %byval-temp, align 8 + call void @passCallee_int_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_int_32(ptr noundef dead_on_return) + +define void @loadAndPass_long_8() { +; CHECK-LABEL: loadAndPass_long_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: jg passCallee_long_8@PLT +entry: + %0 = load <1 x i64>, ptr @global_long_8, align 8 + tail call void @passCallee_long_8(<1 x i64> noundef %0) + ret void +} + +declare void @passCallee_long_8(<1 x i64> noundef) + +define void @loadAndPass_long_16() { +; CHECK-LABEL: loadAndPass_long_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: jg passCallee_long_16@PLT +entry: + %0 = load <2 x i64>, ptr @global_long_16, align 8 + tail call void @passCallee_long_16(<2 x i64> noundef %0) + ret void +} + +declare void @passCallee_long_16(<2 x i64> noundef) + +define void @loadAndPass___int128_16() { +; CHECK-LABEL: loadAndPass___int128_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global___int128_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: jg passCallee___int128_16@PLT +entry: + %0 = load <1 x i128>, ptr @global___int128_16, align 8 + tail call void @passCallee___int128_16(<1 x i128> noundef %0) + ret void +} + +declare void @passCallee___int128_16(<1 x i128> noundef) + +define void @loadAndPass___int128_32() { +; CHECK-LABEL: loadAndPass___int128_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: lgrl %r1, 
global___int128_32@GOT +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vl %v1, 16(%r1), 3 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: vst %v1, 176(%r15), 3 +; CHECK-NEXT: vst %v0, 160(%r15), 3 +; CHECK-NEXT: brasl %r14, passCallee___int128_32@PLT +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <2 x i128>, align 8 + %0 = load <2 x i128>, ptr @global___int128_32, align 8 + store <2 x i128> %0, ptr %byval-temp, align 8 + call void @passCallee___int128_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee___int128_32(ptr noundef dead_on_return) + +define void @loadAndPass__Float16_2() { +; CHECK-LABEL: loadAndPass__Float16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_2@GOT +; CHECK-NEXT: vlreph %v24, 0(%r1) +; CHECK-NEXT: jg passCallee__Float16_2@PLT +entry: + %0 = load <1 x half>, ptr @global__Float16_2, align 2 + tail call void @passCallee__Float16_2(<1 x half> noundef %0) + ret void +} + +declare void @passCallee__Float16_2(<1 x half> noundef) + +define void @loadAndPass__Float16_8() { +; CHECK-LABEL: loadAndPass__Float16_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_8@GOT +; CHECK-NEXT: vlreph %v0, 0(%r1) +; CHECK-NEXT: vlreph %v1, 2(%r1) +; CHECK-NEXT: vlreph %v2, 4(%r1) +; CHECK-NEXT: vlreph %v3, 6(%r1) +; CHECK-NEXT: vmrhh %v2, %v2, %v3 +; CHECK-NEXT: vmrhh %v0, %v0, %v1 +; CHECK-NEXT: vmrhf %v0, %v0, %v2 +; CHECK-NEXT: vmrhg %v24, %v0, %v0 +; CHECK-NEXT: jg passCallee__Float16_8@PLT +entry: + %0 = load <4 x half>, ptr @global__Float16_8, align 8 + tail call void @passCallee__Float16_8(<4 x half> noundef %0) + ret void +} + +declare void @passCallee__Float16_8(<4 x half> noundef) + +define void @loadAndPass__Float16_16() { +; CHECK-LABEL: loadAndPass__Float16_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global__Float16_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: jg passCallee__Float16_16@PLT +entry: + %0 = load <8 x half>, ptr @global__Float16_16, align 8 + tail call void @passCallee__Float16_16(<8 x half> noundef %0) + ret void +} + +declare void @passCallee__Float16_16(<8 x half> noundef) + +define void @loadAndPass__Float16_32() { +; CHECK-LABEL: loadAndPass__Float16_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: lgrl %r1, global__Float16_32@GOT +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vl %v1, 16(%r1), 3 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: vst %v1, 176(%r15), 3 +; CHECK-NEXT: vst %v0, 160(%r15), 3 +; CHECK-NEXT: brasl %r14, passCallee__Float16_32@PLT +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <16 x half>, align 8 + %0 = load <16 x half>, ptr @global__Float16_32, align 8 + store <16 x half> %0, ptr %byval-temp, align 8 + call void @passCallee__Float16_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee__Float16_32(ptr noundef dead_on_return) + +define void @loadAndPass_float_4() { +; CHECK-LABEL: loadAndPass_float_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_4@GOT +; CHECK-NEXT: vlrepf %v24, 0(%r1) +; CHECK-NEXT: jg passCallee_float_4@PLT +entry: + %0 = load <1 x float>, ptr @global_float_4, align 4 + tail call void @passCallee_float_4(<1 x float> noundef %0) + ret void +} + +declare void 
@passCallee_float_4(<1 x float> noundef) + +define void @loadAndPass_float_8() { +; CHECK-LABEL: loadAndPass_float_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: jg passCallee_float_8@PLT +entry: + %0 = load <2 x float>, ptr @global_float_8, align 8 + tail call void @passCallee_float_8(<2 x float> noundef %0) + ret void +} + +declare void @passCallee_float_8(<2 x float> noundef) + +define void @loadAndPass_float_16() { +; CHECK-LABEL: loadAndPass_float_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_float_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: jg passCallee_float_16@PLT +entry: + %0 = load <4 x float>, ptr @global_float_16, align 8 + tail call void @passCallee_float_16(<4 x float> noundef %0) + ret void +} + +declare void @passCallee_float_16(<4 x float> noundef) + +define void @loadAndPass_double_8() { +; CHECK-LABEL: loadAndPass_double_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_double_8@GOT +; CHECK-NEXT: vlrepg %v24, 0(%r1) +; CHECK-NEXT: jg passCallee_double_8@PLT +entry: + %0 = load <1 x double>, ptr @global_double_8, align 8 + tail call void @passCallee_double_8(<1 x double> noundef %0) + ret void +} + +declare void @passCallee_double_8(<1 x double> noundef) + +define void @loadAndPass_double_16() { +; CHECK-LABEL: loadAndPass_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_double_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: jg passCallee_double_16@PLT +entry: + %0 = load <2 x double>, ptr @global_double_16, align 8 + tail call void @passCallee_double_16(<2 x double> noundef %0) + ret void +} + +declare void @passCallee_double_16(<2 x double> noundef) + +define void @loadAndPass_double_32() { +; CHECK-LABEL: loadAndPass_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: lgrl %r1, global_double_32@GOT +; CHECK-NEXT: vl %v0, 0(%r1), 3 +; CHECK-NEXT: vl %v1, 16(%r1), 3 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: vst %v1, 176(%r15), 3 +; CHECK-NEXT: vst %v0, 160(%r15), 3 +; CHECK-NEXT: brasl %r14, passCallee_double_32@PLT +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <4 x double>, align 8 + %0 = load <4 x double>, ptr @global_double_32, align 8 + store <4 x double> %0, ptr %byval-temp, align 8 + call void @passCallee_double_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_double_32(ptr noundef dead_on_return) + +define void @loadAndPass_long_double_16() { +; CHECK-LABEL: loadAndPass_long_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: lgrl %r1, global_long_double_16@GOT +; CHECK-NEXT: vl %v24, 0(%r1), 3 +; CHECK-NEXT: jg passCallee_long_double_16@PLT +entry: + %0 = load <1 x fp128>, ptr @global_long_double_16, align 8 + tail call void @passCallee_long_double_16(<1 x fp128> noundef %0) + ret void +} + +declare void @passCallee_long_double_16(<1 x fp128> noundef) + +define void @loadAndPass_long_double_32() { +; CHECK-LABEL: loadAndPass_long_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: lgrl %r1, global_long_double_32@GOT +; CHECK-NEXT: la %r2, 
160(%r15) +; CHECK-NEXT: mvc 176(16,%r15), 16(%r1) +; CHECK-NEXT: mvc 160(16,%r15), 0(%r1) +; CHECK-NEXT: brasl %r14, passCallee_long_double_32@PLT +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %byval-temp = alloca <2 x fp128>, align 8 + %0 = load <2 x fp128>, ptr @global_long_double_32, align 8 + store <2 x fp128> %0, ptr %byval-temp, align 8 + call void @passCallee_long_double_32(ptr noundef nonnull dead_on_return %byval-temp) + ret void +} + +declare void @passCallee_long_double_32(ptr noundef dead_on_return) + +define void @receiveAndStore_char_1() { +; CHECK-LABEL: receiveAndStore_char_1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_char_1@PLT +; CHECK-NEXT: lgrl %r1, global_char_1@GOT +; CHECK-NEXT: vsteb %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <1 x i8> @retCallee_char_1() + store <1 x i8> %call, ptr @global_char_1, align 2 + ret void +} + +declare <1 x i8> @retCallee_char_1() + +define void @receiveAndStore_char_8() { +; CHECK-LABEL: receiveAndStore_char_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_char_8@PLT +; CHECK-NEXT: lgrl %r1, global_char_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <8 x i8> @retCallee_char_8() + store <8 x i8> %call, ptr @global_char_8, align 8 + ret void +} + +declare <8 x i8> @retCallee_char_8() + +define void @receiveAndStore_char_16() { +; CHECK-LABEL: receiveAndStore_char_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_char_16@PLT +; CHECK-NEXT: lgrl %r1, global_char_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <16 x i8> @retCallee_char_16() + store <16 x i8> %call, ptr @global_char_16, align 8 + ret void +} + +declare <16 x i8> @retCallee_char_16() + +define void @receiveAndStore_char_32() { +; CHECK-LABEL: receiveAndStore_char_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee_char_32@PLT +; CHECK-NEXT: vl %v0, 160(%r15), 3 +; CHECK-NEXT: vl %v1, 176(%r15), 3 +; CHECK-NEXT: lgrl %r1, global_char_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <32 x i8>, align 8 + call void @retCallee_char_32(ptr dead_on_unwind nonnull writable sret(<32 x i8>) align 8 %tmp) + %0 = load <32 x i8>, ptr %tmp, align 8 + store <32 x i8> %0, ptr @global_char_32, align 8 + ret void +} + +declare void @retCallee_char_32(ptr dead_on_unwind writable sret(<32 x i8>) align 8) + +define void @receiveAndStore_short_2() { +; CHECK-LABEL: 
receiveAndStore_short_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_short_2@PLT +; CHECK-NEXT: lgrl %r1, global_short_2@GOT +; CHECK-NEXT: vsteh %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <1 x i16> @retCallee_short_2() + store <1 x i16> %call, ptr @global_short_2, align 2 + ret void +} + +declare <1 x i16> @retCallee_short_2() + +define void @receiveAndStore_short_8() { +; CHECK-LABEL: receiveAndStore_short_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_short_8@PLT +; CHECK-NEXT: lgrl %r1, global_short_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <4 x i16> @retCallee_short_8() + store <4 x i16> %call, ptr @global_short_8, align 8 + ret void +} + +declare <4 x i16> @retCallee_short_8() + +define void @receiveAndStore_short_16() { +; CHECK-LABEL: receiveAndStore_short_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_short_16@PLT +; CHECK-NEXT: lgrl %r1, global_short_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <8 x i16> @retCallee_short_16() + store <8 x i16> %call, ptr @global_short_16, align 8 + ret void +} + +declare <8 x i16> @retCallee_short_16() + +define void @receiveAndStore_int_4() { +; CHECK-LABEL: receiveAndStore_int_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_int_4@PLT +; CHECK-NEXT: lgrl %r1, global_int_4@GOT +; CHECK-NEXT: vstef %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <1 x i32> @retCallee_int_4() + store <1 x i32> %call, ptr @global_int_4, align 4 + ret void +} + +declare <1 x i32> @retCallee_int_4() + +define void @receiveAndStore_int_8() { +; CHECK-LABEL: receiveAndStore_int_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_int_8@PLT +; CHECK-NEXT: lgrl %r1, global_int_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <2 x i32> @retCallee_int_8() + store <2 x i32> %call, ptr @global_int_8, align 8 + ret void +} + +declare <2 x i32> @retCallee_int_8() + +define void @receiveAndStore_int_16() { +; CHECK-LABEL: receiveAndStore_int_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, 
retCallee_int_16@PLT +; CHECK-NEXT: lgrl %r1, global_int_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <4 x i32> @retCallee_int_16() + store <4 x i32> %call, ptr @global_int_16, align 8 + ret void +} + +declare <4 x i32> @retCallee_int_16() + +define void @receiveAndStore_int_32() { +; CHECK-LABEL: receiveAndStore_int_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee_int_32@PLT +; CHECK-NEXT: vl %v0, 160(%r15), 3 +; CHECK-NEXT: vl %v1, 176(%r15), 3 +; CHECK-NEXT: lgrl %r1, global_int_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <8 x i32>, align 8 + call void @retCallee_int_32(ptr dead_on_unwind nonnull writable sret(<8 x i32>) align 8 %tmp) + %0 = load <8 x i32>, ptr %tmp, align 8 + store <8 x i32> %0, ptr @global_int_32, align 8 + ret void +} + +declare void @retCallee_int_32(ptr dead_on_unwind writable sret(<8 x i32>) align 8) + +define void @receiveAndStore_long_8() { +; CHECK-LABEL: receiveAndStore_long_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_long_8@PLT +; CHECK-NEXT: lgrl %r1, global_long_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <1 x i64> @retCallee_long_8() + store <1 x i64> %call, ptr @global_long_8, align 8 + ret void +} + +declare <1 x i64> @retCallee_long_8() + +define void @receiveAndStore_long_16() { +; CHECK-LABEL: receiveAndStore_long_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_long_16@PLT +; CHECK-NEXT: lgrl %r1, global_long_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <2 x i64> @retCallee_long_16() + store <2 x i64> %call, ptr @global_long_16, align 8 + ret void +} + +declare <2 x i64> @retCallee_long_16() + +define void @receiveAndStore___int128_16() { +; CHECK-LABEL: receiveAndStore___int128_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee___int128_16@PLT +; CHECK-NEXT: lgrl %r1, global___int128_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <1 x i128> @retCallee___int128_16() + store <1 x i128> %call, ptr @global___int128_16, align 8 + ret void +} + +declare <1 x i128> @retCallee___int128_16() + +define void @receiveAndStore___int128_32() { +; CHECK-LABEL: receiveAndStore___int128_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, 
-192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee___int128_32@PLT +; CHECK-NEXT: vl %v0, 160(%r15), 3 +; CHECK-NEXT: vl %v1, 176(%r15), 3 +; CHECK-NEXT: lgrl %r1, global___int128_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <2 x i128>, align 8 + call void @retCallee___int128_32(ptr dead_on_unwind nonnull writable sret(<2 x i128>) align 8 %tmp) + %0 = load <2 x i128>, ptr %tmp, align 8 + store <2 x i128> %0, ptr @global___int128_32, align 8 + ret void +} + +declare void @retCallee___int128_32(ptr dead_on_unwind writable sret(<2 x i128>) align 8) + +define void @receiveAndStore__Float16_2() { +; CHECK-LABEL: receiveAndStore__Float16_2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee__Float16_2@PLT +; CHECK-NEXT: lgrl %r1, global__Float16_2@GOT +; CHECK-NEXT: vsteh %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <1 x half> @retCallee__Float16_2() + store <1 x half> %call, ptr @global__Float16_2, align 2 + ret void +} + +declare <1 x half> @retCallee__Float16_2() + +define void @receiveAndStore__Float16_8() { +; CHECK-LABEL: receiveAndStore__Float16_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee__Float16_8@PLT +; CHECK-NEXT: lgrl %r1, global__Float16_8@GOT +; CHECK-NEXT: vsteh %v24, 6(%r1), 3 +; CHECK-NEXT: vsteh %v24, 4(%r1), 2 +; CHECK-NEXT: vsteh %v24, 2(%r1), 1 +; CHECK-NEXT: vsteh %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <4 x half> @retCallee__Float16_8() + store <4 x half> %call, ptr @global__Float16_8, align 8 + ret void +} + +declare <4 x half> @retCallee__Float16_8() + +define void @receiveAndStore__Float16_16() { +; CHECK-LABEL: receiveAndStore__Float16_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee__Float16_16@PLT +; CHECK-NEXT: lgrl %r1, global__Float16_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <8 x half> @retCallee__Float16_16() + store <8 x half> %call, ptr @global__Float16_16, align 8 + ret void +} + +declare <8 x half> @retCallee__Float16_16() + +define void @receiveAndStore__Float16_32() { +; CHECK-LABEL: receiveAndStore__Float16_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee__Float16_32@PLT +; CHECK-NEXT: vl %v0, 160(%r15), 3 +; CHECK-NEXT: vl %v1, 176(%r15), 3 +; CHECK-NEXT: lgrl %r1, global__Float16_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 
+entry: + %tmp = alloca <16 x half>, align 8 + call void @retCallee__Float16_32(ptr dead_on_unwind nonnull writable sret(<16 x half>) align 8 %tmp) + %0 = load <16 x half>, ptr %tmp, align 8 + store <16 x half> %0, ptr @global__Float16_32, align 8 + ret void +} + +declare void @retCallee__Float16_32(ptr dead_on_unwind writable sret(<16 x half>) align 8) + +define void @receiveAndStore_float_4() { +; CHECK-LABEL: receiveAndStore_float_4: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_float_4@PLT +; CHECK-NEXT: lgrl %r1, global_float_4@GOT +; CHECK-NEXT: vstef %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <1 x float> @retCallee_float_4() + store <1 x float> %call, ptr @global_float_4, align 4 + ret void +} + +declare <1 x float> @retCallee_float_4() + +define void @receiveAndStore_float_8() { +; CHECK-LABEL: receiveAndStore_float_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_float_8@PLT +; CHECK-NEXT: lgrl %r1, global_float_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <2 x float> @retCallee_float_8() + store <2 x float> %call, ptr @global_float_8, align 8 + ret void +} + +declare <2 x float> @retCallee_float_8() + +define void @receiveAndStore_float_16() { +; CHECK-LABEL: receiveAndStore_float_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_float_16@PLT +; CHECK-NEXT: lgrl %r1, global_float_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <4 x float> @retCallee_float_16() + store <4 x float> %call, ptr @global_float_16, align 8 + ret void +} + +declare <4 x float> @retCallee_float_16() + +define void @receiveAndStore_double_8() { +; CHECK-LABEL: receiveAndStore_double_8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_double_8@PLT +; CHECK-NEXT: lgrl %r1, global_double_8@GOT +; CHECK-NEXT: vsteg %v24, 0(%r1), 0 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <1 x double> @retCallee_double_8() + store <1 x double> %call, ptr @global_double_8, align 8 + ret void +} + +declare <1 x double> @retCallee_double_8() + +define void @receiveAndStore_double_16() { +; CHECK-LABEL: receiveAndStore_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_double_16@PLT +; CHECK-NEXT: lgrl %r1, global_double_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + 
%call = tail call <2 x double> @retCallee_double_16() + store <2 x double> %call, ptr @global_double_16, align 8 + ret void +} + +declare <2 x double> @retCallee_double_16() + +define void @receiveAndStore_double_32() { +; CHECK-LABEL: receiveAndStore_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee_double_32@PLT +; CHECK-NEXT: vl %v0, 160(%r15), 3 +; CHECK-NEXT: vl %v1, 176(%r15), 3 +; CHECK-NEXT: lgrl %r1, global_double_32@GOT +; CHECK-NEXT: vst %v1, 16(%r1), 3 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <4 x double>, align 8 + call void @retCallee_double_32(ptr dead_on_unwind nonnull writable sret(<4 x double>) align 8 %tmp) + %0 = load <4 x double>, ptr %tmp, align 8 + store <4 x double> %0, ptr @global_double_32, align 8 + ret void +} + +declare void @retCallee_double_32(ptr dead_on_unwind writable sret(<4 x double>) align 8) + +define void @receiveAndStore_long_double_16() { +; CHECK-LABEL: receiveAndStore_long_double_16: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: brasl %r14, retCallee_long_double_16@PLT +; CHECK-NEXT: lgrl %r1, global_long_double_16@GOT +; CHECK-NEXT: vst %v24, 0(%r1), 3 +; CHECK-NEXT: lmg %r14, %r15, 272(%r15) +; CHECK-NEXT: br %r14 +entry: + %call = tail call <1 x fp128> @retCallee_long_double_16() + store <1 x fp128> %call, ptr @global_long_double_16, align 8 + ret void +} + +declare <1 x fp128> @retCallee_long_double_16() + +define void @receiveAndStore_long_double_32() { +; CHECK-LABEL: receiveAndStore_long_double_32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: la %r2, 160(%r15) +; CHECK-NEXT: brasl %r14, retCallee_long_double_32@PLT +; CHECK-NEXT: lgrl %r1, global_long_double_32@GOT +; CHECK-NEXT: mvc 16(16,%r1), 176(%r15) +; CHECK-NEXT: mvc 0(16,%r1), 160(%r15) +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 +entry: + %tmp = alloca <2 x fp128>, align 8 + call void @retCallee_long_double_32(ptr dead_on_unwind nonnull writable sret(<2 x fp128>) align 8 %tmp) + %0 = load <2 x fp128>, ptr %tmp, align 8 + store <2 x fp128> %0, ptr @global_long_double_32, align 8 + ret void +} + +declare void @retCallee_long_double_32(ptr dead_on_unwind writable sret(<2 x fp128>) align 8) diff --git a/llvm/test/CodeGen/SystemZ/vec-abi-03.ll b/llvm/test/CodeGen/SystemZ/vec-abi-03.ll new file mode 100644 index 0000000000000..6a44fe7ef6696 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-abi-03.ll @@ -0,0 +1,53 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s +; +; Test fp16-vector stack arguments. 
+ +define void @bar(<8 x half>, <8 x half>, <8 x half>, <8 x half>, +; CHECK-LABEL: bar: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 176(%r15), 3 +; CHECK-NEXT: lg %r0, 168(%r15) +; CHECK-NEXT: vlrepg %v1, 160(%r15) +; CHECK-NEXT: vsteh %v1, 0(%r2), 0 +; CHECK-NEXT: stg %r0, 0(%r3) +; CHECK-NEXT: vst %v0, 0(%r4), 3 +; CHECK-NEXT: br %r14 + <8 x half>, <8 x half>, <8 x half>, <8 x half>, + <1 x half> %A, <4 x half> %B, <8 x half> %C, + ptr %Dst0, ptr %Dst1, ptr %Dst2) { + store <1 x half> %A, ptr %Dst0 + store <4 x half> %B, ptr %Dst1 + store <8 x half> %C, ptr %Dst2 + ret void +} + +define void @foo(<1 x half> %A, <4 x half> %B, <8 x half> %C) { +; CHECK-LABEL: foo: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r14, %r15, 112(%r15) +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -192 +; CHECK-NEXT: .cfi_def_cfa_offset 352 +; CHECK-NEXT: vst %v28, 176(%r15), 3 +; CHECK-NEXT: vmrhg %v0, %v24, %v26 +; CHECK-NEXT: vgbm %v24, 0 +; CHECK-NEXT: vgbm %v26, 0 +; CHECK-NEXT: vgbm %v28, 0 +; CHECK-NEXT: vgbm %v30, 0 +; CHECK-NEXT: vgbm %v25, 0 +; CHECK-NEXT: vgbm %v27, 0 +; CHECK-NEXT: vgbm %v29, 0 +; CHECK-NEXT: vgbm %v31, 0 +; CHECK-NEXT: vst %v0, 160(%r15), 3 +; CHECK-NEXT: brasl %r14, bar@PLT +; CHECK-NEXT: lmg %r14, %r15, 304(%r15) +; CHECK-NEXT: br %r14 + call void @bar(<8 x half> zeroinitializer, <8 x half> zeroinitializer, + <8 x half> zeroinitializer, <8 x half> zeroinitializer, + <8 x half> zeroinitializer, <8 x half> zeroinitializer, + <8 x half> zeroinitializer, <8 x half> zeroinitializer, + <1 x half> %A, <4 x half> %B, <8 x half> %C) + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/vec-abi-04.ll b/llvm/test/CodeGen/SystemZ/vec-abi-04.ll new file mode 100644 index 0000000000000..3e03e2a5d4621 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-abi-04.ll @@ -0,0 +1,349 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s +; +; Test handling of fp16 IR vector arguments for z10 (without vector support). 
+ +define <1 x half> @pass_half_1(<1 x half> %Dummy, <1 x half> %Arg) { +; CHECK-LABEL: pass_half_1: +; CHECK: # %bb.0: +; CHECK-NEXT: ler %f0, %f2 +; CHECK-NEXT: br %r14 + ret <1 x half> %Arg +} + +define <4 x half> @pass_half_4(<1 x half> %Dummy, <4 x half> %Arg) { +; CHECK-LABEL: pass_half_4: +; CHECK: # %bb.0: +; CHECK-NEXT: lgh %r0, 166(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f1, %r0 +; CHECK-NEXT: ler %f0, %f2 +; CHECK-NEXT: ler %f2, %f4 +; CHECK-NEXT: ler %f4, %f6 +; CHECK-NEXT: ler %f6, %f1 +; CHECK-NEXT: br %r14 + ret <4 x half> %Arg +} + +define <8 x half> @pass_half_8(<1 x half> %Dummy, <8 x half> %Arg) { +; CHECK-LABEL: pass_half_8: +; CHECK: # %bb.0: +; CHECK-NEXT: lgh %r0, 166(%r15) +; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d +; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d +; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d +; CHECK-NEXT: lgh %r1, 174(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgh %r0, 182(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f1, %r1 +; CHECK-NEXT: lgh %r1, 190(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgh %r3, 198(%r15) +; CHECK-NEXT: ldgr %f3, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f5, %r0 +; CHECK-NEXT: sllg %r0, %r3, 48 +; CHECK-NEXT: ldgr %f7, %r0 +; CHECK-NEXT: lgdr %r0, %f6 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r2) +; CHECK-NEXT: lgdr %r0, %f4 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r2) +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r2) +; CHECK-NEXT: lgdr %r0, %f7 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r2) +; CHECK-NEXT: lgdr %r0, %f5 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r2) +; CHECK-NEXT: lgdr %r0, %f3 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r2) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 8(%r2) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r2) +; CHECK-NEXT: br %r14 + ret <8 x half> %Arg +} + +define <16 x half> @pass_half_16(<1 x half> %Dummy, <16 x half> %Arg) { +; CHECK-LABEL: pass_half_16: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r15, -64 +; CHECK-NEXT: .cfi_def_cfa_offset 224 +; CHECK-NEXT: std %f8, 56(%r15) # 8-byte Spill +; CHECK-NEXT: std %f9, 48(%r15) # 8-byte Spill +; CHECK-NEXT: std %f10, 40(%r15) # 8-byte Spill +; CHECK-NEXT: std %f11, 32(%r15) # 8-byte Spill +; CHECK-NEXT: std %f12, 24(%r15) # 8-byte Spill +; CHECK-NEXT: std %f13, 16(%r15) # 8-byte Spill +; CHECK-NEXT: std %f14, 8(%r15) # 8-byte Spill +; CHECK-NEXT: std %f15, 0(%r15) # 8-byte Spill +; CHECK-NEXT: .cfi_offset %f8, -168 +; CHECK-NEXT: .cfi_offset %f9, -176 +; CHECK-NEXT: .cfi_offset %f10, -184 +; CHECK-NEXT: .cfi_offset %f11, -192 +; CHECK-NEXT: .cfi_offset %f12, -200 +; CHECK-NEXT: .cfi_offset %f13, -208 +; CHECK-NEXT: .cfi_offset %f14, -216 +; CHECK-NEXT: .cfi_offset %f15, -224 +; CHECK-NEXT: lgh %r0, 230(%r15) +; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d +; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d +; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d +; CHECK-NEXT: lgh %r1, 238(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgh %r0, 246(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f1, %r1 +; CHECK-NEXT: lgh %r1, 254(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f3, %r0 +; CHECK-NEXT: lgh %r0, 262(%r15) +; CHECK-NEXT: sllg %r1, %r1, 
48 +; CHECK-NEXT: ldgr %f5, %r1 +; CHECK-NEXT: lgh %r1, 270(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f7, %r0 +; CHECK-NEXT: lgh %r0, 278(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f8, %r1 +; CHECK-NEXT: lgh %r1, 286(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f9, %r0 +; CHECK-NEXT: lgh %r0, 294(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f10, %r1 +; CHECK-NEXT: lgh %r1, 302(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: ldgr %f11, %r0 +; CHECK-NEXT: lgh %r0, 310(%r15) +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: ldgr %f12, %r1 +; CHECK-NEXT: lgh %r1, 318(%r15) +; CHECK-NEXT: sllg %r0, %r0, 48 +; CHECK-NEXT: lgh %r3, 326(%r15) +; CHECK-NEXT: ldgr %f13, %r0 +; CHECK-NEXT: sllg %r0, %r1, 48 +; CHECK-NEXT: ldgr %f14, %r0 +; CHECK-NEXT: sllg %r0, %r3, 48 +; CHECK-NEXT: ldgr %f15, %r0 +; CHECK-NEXT: lgdr %r0, %f6 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 4(%r2) +; CHECK-NEXT: lgdr %r0, %f4 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 2(%r2) +; CHECK-NEXT: lgdr %r0, %f2 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 0(%r2) +; CHECK-NEXT: lgdr %r0, %f15 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 30(%r2) +; CHECK-NEXT: lgdr %r0, %f14 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 28(%r2) +; CHECK-NEXT: lgdr %r0, %f13 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 26(%r2) +; CHECK-NEXT: lgdr %r0, %f12 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 24(%r2) +; CHECK-NEXT: lgdr %r0, %f11 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 22(%r2) +; CHECK-NEXT: lgdr %r0, %f10 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 20(%r2) +; CHECK-NEXT: lgdr %r0, %f9 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 18(%r2) +; CHECK-NEXT: lgdr %r0, %f8 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 16(%r2) +; CHECK-NEXT: lgdr %r0, %f7 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 14(%r2) +; CHECK-NEXT: lgdr %r0, %f5 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 12(%r2) +; CHECK-NEXT: lgdr %r0, %f3 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 10(%r2) +; CHECK-NEXT: lgdr %r0, %f1 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 8(%r2) +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sth %r0, 6(%r2) +; CHECK-NEXT: ld %f8, 56(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f9, 48(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f10, 40(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f11, 32(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f12, 24(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f13, 16(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f14, 8(%r15) # 8-byte Reload +; CHECK-NEXT: ld %f15, 0(%r15) # 8-byte Reload +; CHECK-NEXT: aghi %r15, 64 +; CHECK-NEXT: br %r14 + ret <16 x half> %Arg +} + +define <24 x half> @pass_half_24(<1 x half> %Dummy, <24 x half> %Arg) { +; CHECK-LABEL: pass_half_24: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r7, %r15, 56(%r15) +; CHECK-NEXT: .cfi_offset %r7, -104 +; CHECK-NEXT: .cfi_offset %r8, -96 +; CHECK-NEXT: .cfi_offset %r9, -88 +; CHECK-NEXT: .cfi_offset %r10, -80 +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r12, -64 +; CHECK-NEXT: .cfi_offset %r13, -56 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: lgh %r0, 198(%r15) +; CHECK-NEXT: # kill: def $f6h killed $f6h def $f6d +; CHECK-NEXT: # kill: def $f4h killed $f4h def $f4d +; CHECK-NEXT: # kill: def $f2h killed $f2h def $f2d +; CHECK-NEXT: sllg %r0, 
%r0, 48 +; CHECK-NEXT: lgh %r1, 190(%r15) +; CHECK-NEXT: ldgr %f0, %r0 +; CHECK-NEXT: lgdr %r0, %f0 +; CHECK-NEXT: srlg %r0, %r0, 48 +; CHECK-NEXT: sllg %r1, %r1, 48 +; CHECK-NEXT: lgh %r3, 182(%r15) +; CHECK-NEXT: ldgr %f0, %r1 +; CHECK-NEXT: lgdr %r1, %f0 +; CHECK-NEXT: rosbg %r0, %r1, 32, 47, 32 +; CHECK-NEXT: sllg %r1, %r3, 48 +; CHECK-NEXT: lgh %r3, 174(%r15) +; CHECK-NEXT: ldgr %f0, %r1 +; CHECK-NEXT: lgdr %r1, %f0 +; CHECK-NEXT: srlg %r1, %r1, 48 +; CHECK-NEXT: sllg %r3, %r3, 48 +; CHECK-NEXT: lgh %r4, 230(%r15) +; CHECK-NEXT: ldgr %f0, %r3 +; CHECK-NEXT: lgdr %r3, %f0 +; CHECK-NEXT: rosbg %r1, %r3, 32, 47, 32 +; CHECK-NEXT: sllg %r3, %r4, 48 +; CHECK-NEXT: lgh %r4, 222(%r15) +; CHECK-NEXT: ldgr %f0, %r3 +; CHECK-NEXT: lgdr %r3, %f0 +; CHECK-NEXT: srlg %r3, %r3, 48 +; CHECK-NEXT: sllg %r4, %r4, 48 +; CHECK-NEXT: lgh %r5, 214(%r15) +; CHECK-NEXT: ldgr %f0, %r4 +; CHECK-NEXT: lgdr %r4, %f0 +; CHECK-NEXT: rosbg %r3, %r4, 32, 47, 32 +; CHECK-NEXT: sllg %r4, %r5, 48 +; CHECK-NEXT: lgh %r5, 206(%r15) +; CHECK-NEXT: ldgr %f0, %r4 +; CHECK-NEXT: lgdr %r4, %f0 +; CHECK-NEXT: srlg %r4, %r4, 48 +; CHECK-NEXT: sllg %r5, %r5, 48 +; CHECK-NEXT: lgh %r14, 262(%r15) +; CHECK-NEXT: ldgr %f0, %r5 +; CHECK-NEXT: lgdr %r5, %f0 +; CHECK-NEXT: rosbg %r4, %r5, 32, 47, 32 +; CHECK-NEXT: sllg %r5, %r14, 48 +; CHECK-NEXT: lgh %r14, 254(%r15) +; CHECK-NEXT: ldgr %f0, %r5 +; CHECK-NEXT: lgdr %r5, %f0 +; CHECK-NEXT: srlg %r5, %r5, 48 +; CHECK-NEXT: sllg %r14, %r14, 48 +; CHECK-NEXT: lgh %r13, 246(%r15) +; CHECK-NEXT: ldgr %f0, %r14 +; CHECK-NEXT: lgdr %r14, %f0 +; CHECK-NEXT: rosbg %r5, %r14, 32, 47, 32 +; CHECK-NEXT: sllg %r14, %r13, 48 +; CHECK-NEXT: lgh %r13, 238(%r15) +; CHECK-NEXT: ldgr %f0, %r14 +; CHECK-NEXT: lgdr %r14, %f0 +; CHECK-NEXT: srlg %r14, %r14, 48 +; CHECK-NEXT: sllg %r13, %r13, 48 +; CHECK-NEXT: lgh %r12, 294(%r15) +; CHECK-NEXT: ldgr %f0, %r13 +; CHECK-NEXT: lgdr %r13, %f0 +; CHECK-NEXT: rosbg %r14, %r13, 32, 47, 32 +; CHECK-NEXT: sllg %r13, %r12, 48 +; CHECK-NEXT: lgh %r12, 286(%r15) +; CHECK-NEXT: ldgr %f0, %r13 +; CHECK-NEXT: lgdr %r13, %f0 +; CHECK-NEXT: srlg %r13, %r13, 48 +; CHECK-NEXT: sllg %r12, %r12, 48 +; CHECK-NEXT: lgh %r11, 278(%r15) +; CHECK-NEXT: ldgr %f0, %r12 +; CHECK-NEXT: lgdr %r12, %f0 +; CHECK-NEXT: rosbg %r13, %r12, 32, 47, 32 +; CHECK-NEXT: sllg %r12, %r11, 48 +; CHECK-NEXT: lgh %r11, 270(%r15) +; CHECK-NEXT: ldgr %f0, %r12 +; CHECK-NEXT: lgdr %r12, %f0 +; CHECK-NEXT: srlg %r12, %r12, 48 +; CHECK-NEXT: sllg %r11, %r11, 48 +; CHECK-NEXT: lgh %r10, 326(%r15) +; CHECK-NEXT: ldgr %f0, %r11 +; CHECK-NEXT: lgdr %r11, %f0 +; CHECK-NEXT: rosbg %r12, %r11, 32, 47, 32 +; CHECK-NEXT: sllg %r11, %r10, 48 +; CHECK-NEXT: lgh %r10, 318(%r15) +; CHECK-NEXT: ldgr %f0, %r11 +; CHECK-NEXT: lgdr %r11, %f0 +; CHECK-NEXT: srlg %r11, %r11, 48 +; CHECK-NEXT: sllg %r10, %r10, 48 +; CHECK-NEXT: lgh %r9, 310(%r15) +; CHECK-NEXT: ldgr %f0, %r10 +; CHECK-NEXT: lgdr %r10, %f0 +; CHECK-NEXT: rosbg %r11, %r10, 32, 47, 32 +; CHECK-NEXT: sllg %r10, %r9, 48 +; CHECK-NEXT: lgh %r9, 302(%r15) +; CHECK-NEXT: ldgr %f0, %r10 +; CHECK-NEXT: lgdr %r10, %f0 +; CHECK-NEXT: srlg %r10, %r10, 48 +; CHECK-NEXT: sllg %r9, %r9, 48 +; CHECK-NEXT: ldgr %f0, %r9 +; CHECK-NEXT: lgdr %r9, %f0 +; CHECK-NEXT: rosbg %r10, %r9, 32, 47, 32 +; CHECK-NEXT: lgdr %r9, %f4 +; CHECK-NEXT: lgh %r8, 166(%r15) +; CHECK-NEXT: srlg %r9, %r9, 48 +; CHECK-NEXT: lgdr %r7, %f2 +; CHECK-NEXT: rosbg %r9, %r7, 32, 47, 32 +; CHECK-NEXT: sllg %r8, %r8, 48 +; CHECK-NEXT: ldgr %f0, %r8 +; CHECK-NEXT: lgdr %r8, %f0 +; CHECK-NEXT: srlg %r8, %r8, 48 +; 
CHECK-NEXT: lgdr %r7, %f6 +; CHECK-NEXT: rosbg %r8, %r7, 32, 47, 32 +; CHECK-NEXT: sllg %r1, %r1, 32 +; CHECK-NEXT: sllg %r4, %r4, 32 +; CHECK-NEXT: sllg %r14, %r14, 32 +; CHECK-NEXT: sllg %r12, %r12, 32 +; CHECK-NEXT: sllg %r10, %r10, 32 +; CHECK-NEXT: sllg %r9, %r9, 32 +; CHECK-NEXT: lr %r1, %r0 +; CHECK-NEXT: lr %r4, %r3 +; CHECK-NEXT: lr %r14, %r5 +; CHECK-NEXT: lr %r12, %r13 +; CHECK-NEXT: lr %r10, %r11 +; CHECK-NEXT: lr %r9, %r8 +; CHECK-NEXT: stg %r9, 0(%r2) +; CHECK-NEXT: stg %r10, 40(%r2) +; CHECK-NEXT: stg %r12, 32(%r2) +; CHECK-NEXT: stg %r14, 24(%r2) +; CHECK-NEXT: stg %r4, 16(%r2) +; CHECK-NEXT: stg %r1, 8(%r2) +; CHECK-NEXT: lmg %r7, %r15, 56(%r15) +; CHECK-NEXT: br %r14 + ret <24 x half> %Arg +} diff --git a/llvm/test/CodeGen/SystemZ/vec-abi-05.ll b/llvm/test/CodeGen/SystemZ/vec-abi-05.ll new file mode 100644 index 0000000000000..60e0114edf600 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-abi-05.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z16 | FileCheck %s +; +; Test handling of fp16 IR vector arguments for z16 (with vector support). + +define <1 x half> @pass_half_1(<1 x half> %Dummy, <1 x half> %Arg) { +; CHECK-LABEL: pass_half_1: +; CHECK: # %bb.0: +; CHECK-NEXT: vlr %v24, %v26 +; CHECK-NEXT: br %r14 + ret <1 x half> %Arg +} + +define <4 x half> @pass_half_4(<1 x half> %Dummy, <4 x half> %Arg) { +; CHECK-LABEL: pass_half_4: +; CHECK: # %bb.0: +; CHECK-NEXT: vlr %v24, %v26 +; CHECK-NEXT: br %r14 + ret <4 x half> %Arg +} + +define <8 x half> @pass_half_8(<1 x half> %Dummy, <8 x half> %Arg) { +; CHECK-LABEL: pass_half_8: +; CHECK: # %bb.0: +; CHECK-NEXT: vlr %v24, %v26 +; CHECK-NEXT: br %r14 + ret <8 x half> %Arg +} + +define <16 x half> @pass_half_16(<1 x half> %Dummy, <16 x half> %Arg) { +; CHECK-LABEL: pass_half_16: +; CHECK: # %bb.0: +; CHECK-NEXT: vlr %v24, %v26 +; CHECK-NEXT: vlr %v26, %v28 +; CHECK-NEXT: br %r14 + ret <16 x half> %Arg +} + +define <24 x half> @pass_half_24(<1 x half> %Dummy, <24 x half> %Arg) { +; CHECK-LABEL: pass_half_24: +; CHECK: # %bb.0: +; CHECK-NEXT: vlr %v24, %v26 +; CHECK-NEXT: vlr %v26, %v28 +; CHECK-NEXT: vlr %v28, %v30 +; CHECK-NEXT: br %r14 + ret <24 x half> %Arg +} + +define <72 x half> @pass_half_72(<72 x half> %Arg) { +; CHECK-LABEL: pass_half_72: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 160(%r15), 3 +; CHECK-NEXT: vst %v0, 128(%r2), 4 +; CHECK-NEXT: vst %v31, 112(%r2), 4 +; CHECK-NEXT: vst %v29, 96(%r2), 4 +; CHECK-NEXT: vst %v27, 80(%r2), 4 +; CHECK-NEXT: vst %v25, 64(%r2), 4 +; CHECK-NEXT: vst %v30, 48(%r2), 4 +; CHECK-NEXT: vst %v28, 32(%r2), 4 +; CHECK-NEXT: vst %v26, 16(%r2), 4 +; CHECK-NEXT: vst %v24, 0(%r2), 4 +; CHECK-NEXT: br %r14 + ret <72 x half> %Arg +} diff --git a/llvm/test/CodeGen/SystemZ/vec-cmp-09.ll b/llvm/test/CodeGen/SystemZ/vec-cmp-09.ll index cb8850e58c589..3ed6d200cd03c 100644 --- a/llvm/test/CodeGen/SystemZ/vec-cmp-09.ll +++ b/llvm/test/CodeGen/SystemZ/vec-cmp-09.ll @@ -166,8 +166,48 @@ define <2 x i64> @f16(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3) { ret <2 x i64> %ret } -define <4 x float> @f17(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { +define <8 x half> @f17(<8 x i16> %val1, <8 x half> %val2, <8 x half> %val3) { ; CHECK-LABEL: f17: +; CHECK: vblendh %v24, %v26, %v28, %v24 +; CHECK-NEXT: br %r14 + %cmp = icmp slt <8 x i16> %val1, zeroinitializer + %ret = select <8 x i1> %cmp, <8 x half> %val2, <8 x half> %val3 + ret <8 x half> %ret +} + +define <8 x half> 
@f18(<8 x i16> %val1, <8 x half> %val2, <8 x half> %val3) { +; CHECK-LABEL: f18: +; CHECK: vblendh %v24, %v28, %v26, %v24 +; CHECK-NEXT: br %r14 + %cmp = icmp sge <8 x i16> %val1, zeroinitializer + %ret = select <8 x i1> %cmp, <8 x half> %val2, <8 x half> %val3 + ret <8 x half> %ret +} + +define <8 x half> @f19(<8 x i16> %val1, <8 x half> %val2, <8 x half> %val3) { +; CHECK-LABEL: f19: +; CHECK: vblendh %v24, %v26, %v28, %v24 +; CHECK-NEXT: br %r14 + %mask = and <8 x i16> %val1, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> + %cmp = icmp ne <8 x i16> %mask, zeroinitializer + %ret = select <8 x i1> %cmp, <8 x half> %val2, <8 x half> %val3 + ret <8 x half> %ret +} + +define <8 x half> @f20(<8 x i16> %val1, <8 x half> %val2, <8 x half> %val3) { +; CHECK-LABEL: f20: +; CHECK: vblendh %v24, %v28, %v26, %v24 +; CHECK-NEXT: br %r14 + %mask = and <8 x i16> %val1, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> + %cmp = icmp eq <8 x i16> %mask, zeroinitializer + %ret = select <8 x i1> %cmp, <8 x half> %val2, <8 x half> %val3 + ret <8 x half> %ret +} + +define <4 x float> @f21(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f21: ; CHECK: vblendf %v24, %v26, %v28, %v24 ; CHECK-NEXT: br %r14 %cmp = icmp slt <4 x i32> %val1, zeroinitializer @@ -175,8 +215,8 @@ define <4 x float> @f17(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { ret <4 x float> %ret } -define <4 x float> @f18(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { -; CHECK-LABEL: f18: +define <4 x float> @f22(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f22: ; CHECK: vblendf %v24, %v28, %v26, %v24 ; CHECK-NEXT: br %r14 %cmp = icmp sge <4 x i32> %val1, zeroinitializer @@ -184,8 +224,8 @@ define <4 x float> @f18(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { ret <4 x float> %ret } -define <4 x float> @f19(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { -; CHECK-LABEL: f19: +define <4 x float> @f23(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f23: ; CHECK: vblendf %v24, %v26, %v28, %v24 ; CHECK-NEXT: br %r14 %mask = and <4 x i32> %val1, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648> @@ -193,8 +233,8 @@ define <4 x float> @f19(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { ret <4 x float> %ret } -define <4 x float> @f20(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { -; CHECK-LABEL: f20: +define <4 x float> @f24(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { +; CHECK-LABEL: f24: ; CHECK: vblendf %v24, %v28, %v26, %v24 ; CHECK-NEXT: br %r14 %mask = and <4 x i32> %val1, <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648> @@ -202,8 +242,8 @@ define <4 x float> @f20(<4 x i32> %val1, <4 x float> %val2, <4 x float> %val3) { ret <4 x float> %ret } -define <2 x double> @f21(<2 x i64> %val1, <2 x double> %val2, <2 x double> %val3) { -; CHECK-LABEL: f21: +define <2 x double> @f25(<2 x i64> %val1, <2 x double> %val2, <2 x double> %val3) { +; CHECK-LABEL: f25: ; CHECK: vblendg %v24, %v26, %v28, %v24 ; CHECK-NEXT: br %r14 %cmp = icmp slt <2 x i64> %val1, zeroinitializer @@ -215,8 +255,8 @@ define <2 x double> @f21(<2 x i64> %val1, <2 x double> %val2, <2 x double> %val3 ret <2 x double> %ret } -define <2 x double> @f22(<2 x i64> %val1, <2 x double> %val2, <2 x double> %val3) { -; CHECK-LABEL: f22: +define <2 x double> @f26(<2 x i64> %val1, <2 x double> %val2, <2 x double> %val3) { +; CHECK-LABEL: f26: ; CHECK: vblendg %v24, %v28, %v26, %v24 ; CHECK-NEXT: br %r14 %cmp = icmp sge <2 x i64> %val1, zeroinitializer @@ -224,8 +264,8 @@ define <2 x double> @f22(<2 x i64> %val1, <2 x double> %val2, <2 x double> %val3 ret <2 x double> %ret } -define <2 x double> @f23(<2 x i64> %val1, 
<2 x double> %val2, <2 x double> %val3) { +; CHECK-LABEL: f27: ; CHECK: vblendg %v24, %v26, %v28, %v24 ; CHECK-NEXT: br %r14 %mask = and <2 x i64> %val1, <i64 -9223372036854775808, i64 -9223372036854775808> @@ -233,8 +273,8 @@ define <2 x double> @f23(<2 x i64> %val1, <2 x double> %val2, <2 x double> %val3 ret <2 x double> %ret } -define <2 x double> @f24(<2 x i64> %val1, <2 x double> %val2, <2 x double> %val3) { -; CHECK-LABEL: f24: +define <2 x double> @f28(<2 x i64> %val1, <2 x double> %val2, <2 x double> %val3) { +; CHECK-LABEL: f28: ; CHECK: vblendg %v24, %v28, %v26, %v24 ; CHECK-NEXT: br %r14 %mask = and <2 x i64> %val1, <i64 -9223372036854775808, i64 -9223372036854775808> @@ -242,9 +282,8 @@ define <2 x double> @f24(<2 x i64> %val1, <2 x double> %val2, <2 x double> %val3 %ret = select <2 x i1> %cmp, <2 x double> %val2, <2 x double> %val3 ret <2 x double> %ret } - diff --git a/llvm/test/CodeGen/SystemZ/vec-eswap-01.ll b/llvm/test/CodeGen/SystemZ/vec-eswap-01.ll index a1eb6b3990ff6..9baf67a75920c 100644 --- a/llvm/test/CodeGen/SystemZ/vec-eswap-01.ll +++ b/llvm/test/CodeGen/SystemZ/vec-eswap-01.ll @@ -50,9 +50,21 @@ define <2 x i64> @f4(ptr %ptr) { ret <2 x i64> %ret } -; Test v4f32 loads. -define <4 x float> @f5(ptr %ptr) { +; Test v8f16 loads. +define <8 x half> @f5(ptr %ptr) { ; CHECK-LABEL: f5: +; CHECK: vlerh %v24, 0(%r2) +; CHECK: br %r14 + %load = load <8 x half>, ptr %ptr + %ret = shufflevector <8 x half> %load, <8 x half> undef, + <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + ret <8 x half> %ret +} + +; Test v4f32 loads. +define <4 x float> @f6(ptr %ptr) { +; CHECK-LABEL: f6: ; CHECK: vlerf %v24, 0(%r2) ; CHECK: br %r14 %load = load <4 x float>, ptr %ptr @@ -62,8 +74,8 @@ define <4 x float> @f5(ptr %ptr) { } ; Test v2f64 loads. -define <2 x double> @f6(ptr %ptr) { -; CHECK-LABEL: f6: +define <2 x double> @f7(ptr %ptr) { +; CHECK-LABEL: f7: ; CHECK: vlerg %v24, 0(%r2) ; CHECK: br %r14 %load = load <2 x double>, ptr %ptr @@ -73,8 +85,8 @@ define <2 x double> @f6(ptr %ptr) { } ; Test the highest aligned in-range offset. -define <4 x i32> @f7(ptr %base) { -; CHECK-LABEL: f7: +define <4 x i32> @f8(ptr %base) { +; CHECK-LABEL: f8: ; CHECK: vlerf %v24, 4080(%r2) ; CHECK: br %r14 %ptr = getelementptr <4 x i32>, ptr %base, i64 255 @@ -85,8 +97,8 @@ define <4 x i32> @f7(ptr %base) { } ; Test the highest unaligned in-range offset. -define <4 x i32> @f8(ptr %base) { -; CHECK-LABEL: f8: +define <4 x i32> @f9(ptr %base) { +; CHECK-LABEL: f9: ; CHECK: vlerf %v24, 4095(%r2) ; CHECK: br %r14 %addr = getelementptr i8, ptr %base, i64 4095 @@ -97,8 +109,8 @@ define <4 x i32> @f8(ptr %base) { } ; Test the next offset up, which requires separate address logic, -define <4 x i32> @f9(ptr %base) { -; CHECK-LABEL: f9: +define <4 x i32> @f10(ptr %base) { +; CHECK-LABEL: f10: ; CHECK: aghi %r2, 4096 ; CHECK: vlerf %v24, 0(%r2) ; CHECK: br %r14 @@ -110,8 +122,8 @@ define <4 x i32> @f9(ptr %base) { } ; Test negative offsets, which also require separate address logic, -define <4 x i32> @f10(ptr %base) { -; CHECK-LABEL: f10: +define <4 x i32> @f11(ptr %base) { +; CHECK-LABEL: f11: ; CHECK: aghi %r2, -16 ; CHECK: vlerf %v24, 0(%r2) ; CHECK: br %r14 @@ -123,8 +135,8 @@ define <4 x i32> @f10(ptr %base) { } ; Check that indexes are allowed. 
-define <4 x i32> @f11(ptr %base, i64 %index) { -; CHECK-LABEL: f11: +define <4 x i32> @f12(ptr %base, i64 %index) { +; CHECK-LABEL: f12: ; CHECK: vlerf %v24, 0(%r3,%r2) ; CHECK: br %r14 %addr = getelementptr i8, ptr %base, i64 %index diff --git a/llvm/test/CodeGen/SystemZ/vec-eswap-02.ll b/llvm/test/CodeGen/SystemZ/vec-eswap-02.ll index 2ce3aa5a42e8a..4eb6bcfcfb04b 100644 --- a/llvm/test/CodeGen/SystemZ/vec-eswap-02.ll +++ b/llvm/test/CodeGen/SystemZ/vec-eswap-02.ll @@ -50,9 +50,21 @@ define void @f4(<2 x i64> %val, ptr %ptr) { ret void } -; Test v4f32 stores. -define void @f5(<4 x float> %val, ptr %ptr) { +; Test v8f16 stores. +define void @f5(<8 x half> %val, ptr %ptr) { ; CHECK-LABEL: f5: +; CHECK: vsterh %v24, 0(%r2) +; CHECK: br %r14 + %swap = shufflevector <8 x half> %val, <8 x half> undef, + <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0> + store <8 x half> %swap, ptr %ptr + ret void +} + +; Test v4f32 stores. +define void @f6(<4 x float> %val, ptr %ptr) { +; CHECK-LABEL: f6: ; CHECK: vsterf %v24, 0(%r2) ; CHECK: br %r14 %swap = shufflevector <4 x float> %val, <4 x float> undef, @@ -62,8 +74,8 @@ define void @f5(<4 x float> %val, ptr %ptr) { } ; Test v2f64 stores. -define void @f6(<2 x double> %val, ptr %ptr) { -; CHECK-LABEL: f6: +define void @f7(<2 x double> %val, ptr %ptr) { +; CHECK-LABEL: f7: ; CHECK: vsterg %v24, 0(%r2) ; CHECK: br %r14 %swap = shufflevector <2 x double> %val, <2 x double> undef, @@ -73,8 +85,8 @@ define void @f6(<2 x double> %val, ptr %ptr) { } ; Test the highest aligned in-range offset. -define void @f7(<4 x i32> %val, ptr %base) { -; CHECK-LABEL: f7: +define void @f8(<4 x i32> %val, ptr %base) { +; CHECK-LABEL: f8: ; CHECK: vsterf %v24, 4080(%r2) ; CHECK: br %r14 %ptr = getelementptr <4 x i32>, ptr %base, i64 255 @@ -85,8 +97,8 @@ define void @f7(<4 x i32> %val, ptr %base) { } ; Test the highest unaligned in-range offset. -define void @f8(<4 x i32> %val, ptr %base) { -; CHECK-LABEL: f8: +define void @f9(<4 x i32> %val, ptr %base) { +; CHECK-LABEL: f9: ; CHECK: vsterf %v24, 4095(%r2) ; CHECK: br %r14 %addr = getelementptr i8, ptr %base, i64 4095 @@ -97,8 +109,8 @@ define void @f8(<4 x i32> %val, ptr %base) { } ; Test the next offset up, which requires separate address logic, -define void @f9(<4 x i32> %val, ptr %base) { -; CHECK-LABEL: f9: +define void @f10(<4 x i32> %val, ptr %base) { +; CHECK-LABEL: f10: ; CHECK: aghi %r2, 4096 ; CHECK: vsterf %v24, 0(%r2) ; CHECK: br %r14 @@ -110,8 +122,8 @@ define void @f9(<4 x i32> %val, ptr %base) { } ; Test negative offsets, which also require separate address logic, -define void @f10(<4 x i32> %val, ptr %base) { -; CHECK-LABEL: f10: +define void @f11(<4 x i32> %val, ptr %base) { +; CHECK-LABEL: f11: ; CHECK: aghi %r2, -16 ; CHECK: vsterf %v24, 0(%r2) ; CHECK: br %r14 @@ -123,8 +135,8 @@ define void @f10(<4 x i32> %val, ptr %base) { } ; Check that indexes are allowed. 
-define void @f11(<4 x i32> %val, ptr %base, i64 %index) {
-; CHECK-LABEL: f11:
+define void @f12(<4 x i32> %val, ptr %base, i64 %index) {
+; CHECK-LABEL: f12:
 ; CHECK: vsterf %v24, 0(%r3,%r2)
 ; CHECK: br %r14
   %addr = getelementptr i8, ptr %base, i64 %index
@@ -133,4 +145,3 @@ define void @f11(<4 x i32> %val, ptr %base, i64 %index) {
   store <4 x i32> %swap, ptr %addr, align 1
   ret void
 }
-
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-04.ll b/llvm/test/CodeGen/SystemZ/vec-move-04.ll
index 27c9e5f71f403..400ca0b58bda9 100644
--- a/llvm/test/CodeGen/SystemZ/vec-move-04.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-04.ll
@@ -110,19 +110,52 @@ define <2 x i64> @f12(<2 x i64> %val, i64 %element, i32 %index) {
   ret <2 x i64> %ret
 }
 
-; Test v4f32 insertion into the first element.
-define <4 x float> @f13(<4 x float> %val, float %element) {
+; Test v8f16 insertion into the first element.
+define <8 x half> @f13(<8 x half> %val, half %element) {
 ; CHECK-LABEL: f13:
+; CHECK: lgdr %r0, %f0
+; CHECK: srlg %r0, %r0, 48
+; CHECK: vlvgh %v24, %r0, 0
+; CHECK: br %r14
+  %ret = insertelement <8 x half> %val, half %element, i32 0
+  ret <8 x half> %ret
+}
+
+; Test v8f16 insertion into the last element.
+define <8 x half> @f14(<8 x half> %val, half %element) {
+; CHECK-LABEL: f14:
+; CHECK: lgdr %r0, %f0
+; CHECK: srlg %r0, %r0, 48
+; CHECK: vlvgh %v24, %r0, 7
+; CHECK: br %r14
+  %ret = insertelement <8 x half> %val, half %element, i32 7
+  ret <8 x half> %ret
+}
+
+; Test v8f16 insertion into a variable element.
+define <8 x half> @f15(<8 x half> %val, half %element, i32 %index) {
+; CHECK-LABEL: f15:
+; CHECK: lgdr %r0, %f0
+; CHECK: srlg %r0, %r0, 48
+; CHECK: vlvgh %v24, %r0, 0(%r2)
+; CHECK: br %r14
+  %ret = insertelement <8 x half> %val, half %element, i32 %index
+  ret <8 x half> %ret
+}
+
+; Test v4f32 insertion into the first element.
+define <4 x float> @f16(<4 x float> %val, float %element) {
+; CHECK-LABEL: f16:
 ; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0
 ; CHECK: vlvgf %v24, [[REG]], 0
 ; CHECK: br %r14
   %ret = insertelement <4 x float> %val, float %element, i32 0
   ret <4 x float> %ret
 }
 
 ; Test v4f32 insertion into the last element.
-define <4 x float> @f14(<4 x float> %val, float %element) {
-; CHECK-LABEL: f14:
+define <4 x float> @f17(<4 x float> %val, float %element) {
+; CHECK-LABEL: f17:
 ; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0
 ; CHECK: vlvgf %v24, [[REG]], 3
 ; CHECK: br %r14
@@ -131,8 +164,8 @@ define <4 x float> @f14(<4 x float> %val, float %element) {
 }
 
 ; Test v4f32 insertion into a variable element.
-define <4 x float> @f15(<4 x float> %val, float %element, i32 %index) {
-; CHECK-LABEL: f15:
+define <4 x float> @f18(<4 x float> %val, float %element, i32 %index) {
+; CHECK-LABEL: f18:
 ; CHECK: vlgvf [[REG:%r[0-5]]], %v0, 0
 ; CHECK: vlvgf %v24, [[REG]], 0(%r2)
 ; CHECK: br %r14
@@ -141,8 +174,8 @@ define <4 x float> @f15(<4 x float> %val, float %element, i32 %index) {
 }
 
 ; Test v2f64 insertion into the first element.
-define <2 x double> @f16(<2 x double> %val, double %element) {
-; CHECK-LABEL: f16:
+define <2 x double> @f19(<2 x double> %val, double %element) {
+; CHECK-LABEL: f19:
 ; CHECK: vpdi %v24, %v0, %v24, 1
 ; CHECK: br %r14
   %ret = insertelement <2 x double> %val, double %element, i32 0
@@ -150,8 +183,8 @@ define <2 x double> @f16(<2 x double> %val, double %element) {
 }
 
 ; Test v2f64 insertion into the last element.
-define <2 x double> @f17(<2 x double> %val, double %element) {
-; CHECK-LABEL: f17:
+define <2 x double> @f20(<2 x double> %val, double %element) {
+; CHECK-LABEL: f20:
 ; CHECK: vpdi %v24, %v24, %v0, 0
 ; CHECK: br %r14
   %ret = insertelement <2 x double> %val, double %element, i32 1
@@ -159,8 +192,8 @@ define <2 x double> @f17(<2 x double> %val, double %element) {
 }
 
 ; Test v2f64 insertion into a variable element.
-define <2 x double> @f18(<2 x double> %val, double %element, i32 %index) {
-; CHECK-LABEL: f18:
+define <2 x double> @f21(<2 x double> %val, double %element, i32 %index) {
+; CHECK-LABEL: f21:
 ; CHECK: lgdr [[REG:%r[0-5]]], %f0
 ; CHECK: vlvgg %v24, [[REG]], 0(%r2)
 ; CHECK: br %r14
@@ -169,8 +202,8 @@ define <2 x double> @f18(<2 x double> %val, double %element, i32 %index) {
 }
 
 ; Test v16i8 insertion into a variable element plus one.
-define <16 x i8> @f19(<16 x i8> %val, i8 %element, i32 %index) {
-; CHECK-LABEL: f19:
+define <16 x i8> @f22(<16 x i8> %val, i8 %element, i32 %index) {
+; CHECK-LABEL: f22:
 ; CHECK: vlvgb %v24, %r2, 1(%r3)
 ; CHECK: br %r14
   %add = add i32 %index, 1
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-05.ll b/llvm/test/CodeGen/SystemZ/vec-move-05.ll
index 99871196d685e..4e092da3f7070 100644
--- a/llvm/test/CodeGen/SystemZ/vec-move-05.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-05.ll
@@ -150,18 +150,54 @@ define i64 @f16(<2 x i64> %val, i32 %index) {
   ret i64 %ret
 }
 
-; Test v4f32 extraction of element 0.
-define float @f17(<4 x float> %val) {
+; Test v8f16 extraction of the first element.
+define half @f17(<8 x half> %val) {
 ; CHECK-LABEL: f17:
 ; CHECK: vlr %v0, %v24
+; CHECK: br %r14
+  %ret = extractelement <8 x half> %val, i32 0
+  ret half %ret
+}
+
+; Test v8f16 extraction of the last element.
+define half @f18(<8 x half> %val) {
+; CHECK-LABEL: f18:
+; CHECK: vreph %v0, %v24, 7
+; CHECK: br %r14
+  %ret = extractelement <8 x half> %val, i32 7
+  ret half %ret
+}
+
+; Test v8f16 extractions of an absurd element number. This must compile
+; but we don't care what it does.
+define half @f19(<8 x half> %val) {
+  %ret = extractelement <8 x half> %val, i32 100000
+  ret half %ret
+}
+
+; Test v8f16 extraction of a variable element.
+define half @f20(<8 x half> %val, i32 %index) {
+; CHECK-LABEL: f20:
+; CHECK: vlgvh %r0, %v24, 0(%r2)
+; CHECK: sllg %r0, %r0, 48
+; CHECK: ldgr %f0, %r0
+; CHECK: br %r14
+  %ret = extractelement <8 x half> %val, i32 %index
+  ret half %ret
+}
+
+; Test v4f32 extraction of element 0.
+define float @f21(<4 x float> %val) {
+; CHECK-LABEL: f21:
+; CHECK: vlr %v0, %v24
 ; CHECK: br %r14
   %ret = extractelement <4 x float> %val, i32 0
   ret float %ret
 }
 
 ; Test v4f32 extraction of element 1.
-define float @f18(<4 x float> %val) {
-; CHECK-LABEL: f18:
+define float @f22(<4 x float> %val) {
+; CHECK-LABEL: f22:
 ; CHECK: vrepf %v0, %v24, 1
 ; CHECK: br %r14
   %ret = extractelement <4 x float> %val, i32 1
@@ -169,8 +205,8 @@ define float @f18(<4 x float> %val) {
 }
 
 ; Test v4f32 extraction of element 2.
-define float @f19(<4 x float> %val) {
-; CHECK-LABEL: f19:
+define float @f23(<4 x float> %val) {
+; CHECK-LABEL: f23:
 ; CHECK: vrepf %v0, %v24, 2
 ; CHECK: br %r14
   %ret = extractelement <4 x float> %val, i32 2
@@ -178,8 +214,8 @@ define float @f19(<4 x float> %val) {
 }
 
 ; Test v4f32 extraction of element 3.
-define float @f20(<4 x float> %val) {
-; CHECK-LABEL: f20:
+define float @f24(<4 x float> %val) {
+; CHECK-LABEL: f24:
 ; CHECK: vrepf %v0, %v24, 3
 ; CHECK: br %r14
   %ret = extractelement <4 x float> %val, i32 3
@@ -188,14 +224,14 @@ define float @f20(<4 x float> %val) {
 
 ; Test v4f32 extractions of an absurd element number. This must compile
 ; but we don't care what it does.
-define float @f21(<4 x float> %val) {
+define float @f25(<4 x float> %val) {
   %ret = extractelement <4 x float> %val, i32 100000
   ret float %ret
 }
 
 ; Test v4f32 extraction of a variable element.
-define float @f22(<4 x float> %val, i32 %index) {
-; CHECK-LABEL: f22:
+define float @f26(<4 x float> %val, i32 %index) {
+; CHECK-LABEL: f26:
 ; CHECK: vlgvf [[REG:%r[0-5]]], %v24, 0(%r2)
 ; CHECK: vlvgf %v0, [[REG]], 0
 ; CHECK: br %r14
@@ -204,8 +240,8 @@ define float @f22(<4 x float> %val, i32 %index) {
 }
 
 ; Test v2f64 extraction of the first element.
-define double @f23(<2 x double> %val) {
-; CHECK-LABEL: f23:
+define double @f27(<2 x double> %val) {
+; CHECK-LABEL: f27:
 ; CHECK: vlr %v0, %v24
 ; CHECK: br %r14
   %ret = extractelement <2 x double> %val, i32 0
@@ -213,8 +249,8 @@ define double @f23(<2 x double> %val) {
 }
 
 ; Test v2f64 extraction of the last element.
-define double @f24(<2 x double> %val) {
-; CHECK-LABEL: f24:
+define double @f28(<2 x double> %val) {
+; CHECK-LABEL: f28:
 ; CHECK: vrepg %v0, %v24, 1
 ; CHECK: br %r14
   %ret = extractelement <2 x double> %val, i32 1
@@ -223,14 +259,14 @@
 
 ; Test v2f64 extractions of an absurd element number. This must compile
 ; but we don't care what it does.
-define double @f25(<2 x double> %val) {
+define double @f29(<2 x double> %val) {
   %ret = extractelement <2 x double> %val, i32 100000
   ret double %ret
 }
 
 ; Test v2f64 extraction of a variable element.
-define double @f26(<2 x double> %val, i32 %index) {
-; CHECK-LABEL: f26:
+define double @f30(<2 x double> %val, i32 %index) {
+; CHECK-LABEL: f30:
 ; CHECK: vlgvg [[REG:%r[0-5]]], %v24, 0(%r2)
 ; CHECK: ldgr %f0, [[REG]]
 ; CHECK: br %r14
@@ -239,8 +275,8 @@ define double @f26(<2 x double> %val, i32 %index) {
 }
 
 ; Test v16i8 extraction of a variable element with an offset.
-define i8 @f27(<16 x i8> %val, i32 %index) {
-; CHECK-LABEL: f27:
+define i8 @f31(<16 x i8> %val, i32 %index) {
+; CHECK-LABEL: f31:
 ; CHECK: vlgvb %r2, %v24, 1(%r2)
 ; CHECK: br %r14
   %add = add i32 %index, 1
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-07.ll b/llvm/test/CodeGen/SystemZ/vec-move-07.ll
index b0d06f782dee7..638148ef5f29d 100644
--- a/llvm/test/CodeGen/SystemZ/vec-move-07.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-07.ll
@@ -38,18 +38,27 @@ define <2 x i64> @f4(i64 %val) {
   ret <2 x i64> %ret
 }
 
-; Test v4f32, which is just a move.
-define <4 x float> @f5(float %val) {
+; Test v8f16, which is just a move.
+define <8 x half> @f5(half %val) {
 ; CHECK-LABEL: f5:
 ; CHECK: vlr %v24, %v0
+; CHECK: br %r14
+  %ret = insertelement <8 x half> undef, half %val, i32 0
+  ret <8 x half> %ret
+}
+
+; Likewise v4f32,
+define <4 x float> @f6(float %val) {
+; CHECK-LABEL: f6:
+; CHECK: vlr %v24, %v0
 ; CHECK: br %r14
   %ret = insertelement <4 x float> undef, float %val, i32 0
   ret <4 x float> %ret
 }
 
-; Likewise v2f64.
-define <2 x double> @f6(double %val) {
-; CHECK-LABEL: f6:
+; and v2f64.
+define <2 x double> @f7(double %val) {
+; CHECK-LABEL: f7:
 ; CHECK: vlr %v24, %v0
 ; CHECK: br %r14
   %ret = insertelement <2 x double> undef, double %val, i32 0
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-10.ll b/llvm/test/CodeGen/SystemZ/vec-move-10.ll
index 3c3862bf9e192..5af836c696fc9 100644
--- a/llvm/test/CodeGen/SystemZ/vec-move-10.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-10.ll
@@ -258,9 +258,76 @@ define void @f24(<2 x i64> %val, ptr %ptr, i32 %index) {
   ret void
 }
 
-; Test v4f32 extraction from the first element.
-define void @f25(<4 x float> %val, ptr %ptr) {
+; Test v8f16 extraction from the first element.
+define void @f25(<8 x half> %val, ptr %ptr) {
 ; CHECK-LABEL: f25:
+; CHECK: vsteh %v24, 0(%r2), 0
+; CHECK: br %r14
+  %element = extractelement <8 x half> %val, i32 0
+  store half %element, ptr %ptr
+  ret void
+}
+
+; Test v8f16 extraction from the last element.
+define void @f26(<8 x half> %val, ptr %ptr) {
+; CHECK-LABEL: f26:
+; CHECK: vsteh %v24, 0(%r2), 7
+; CHECK: br %r14
+  %element = extractelement <8 x half> %val, i32 7
+  store half %element, ptr %ptr
+  ret void
+}
+
+; Test v8f16 extraction of an invalid element. This must compile,
+; but we don't care what it does.
+define void @f27(<8 x half> %val, ptr %ptr) {
+; CHECK-LABEL: f27:
+; CHECK-NOT: vsteh %v24, 0(%r2), 8
+; CHECK: br %r14
+  %element = extractelement <8 x half> %val, i32 8
+  store half %element, ptr %ptr
+  ret void
+}
+
+; Test v8f16 extraction with the highest in-range offset.
+define void @f28(<8 x half> %val, ptr %base) {
+; CHECK-LABEL: f28:
+; CHECK: vsteh %v24, 4094(%r2), 2
+; CHECK: br %r14
+  %ptr = getelementptr half, ptr %base, i32 2047
+  %element = extractelement <8 x half> %val, i32 2
+  store half %element, ptr %ptr
+  ret void
+}
+
+; Test v8f16 extraction with the first out-of-range offset.
+define void @f29(<8 x half> %val, ptr %base) {
+; CHECK-LABEL: f29:
+; CHECK: aghi %r2, 4096
+; CHECK: vsteh %v24, 0(%r2), 1
+; CHECK: br %r14
+  %ptr = getelementptr half, ptr %base, i32 2048
+  %element = extractelement <8 x half> %val, i32 1
+  store half %element, ptr %ptr
+  ret void
+}
+
+; Test v8f16 extraction from a variable element.
+define void @f30(<8 x half> %val, ptr %ptr, i32 %index) {
+; CHECK-LABEL: f30:
+; CHECK: vlgvh %r0, %v24, 0(%r3)
+; CHECK: sllg %r0, %r0, 48
+; CHECK: ldgr %f0, %r0
+; CHECK: vsteh %v0, 0(%r2), 0
+; CHECK: br %r14
+  %element = extractelement <8 x half> %val, i32 %index
+  store half %element, ptr %ptr
+  ret void
+}
+
+; Test v4f32 extraction from the first element.
+define void @f31(<4 x float> %val, ptr %ptr) {
+; CHECK-LABEL: f31:
 ; CHECK: vstef %v24, 0(%r2), 0
 ; CHECK: br %r14
   %element = extractelement <4 x float> %val, i32 0
@@ -269,8 +336,8 @@ define void @f25(<4 x float> %val, ptr %ptr) {
 }
 
 ; Test v4f32 extraction from the last element.
-define void @f26(<4 x float> %val, ptr %ptr) {
-; CHECK-LABEL: f26:
+define void @f32(<4 x float> %val, ptr %ptr) {
+; CHECK-LABEL: f32:
 ; CHECK: vstef %v24, 0(%r2), 3
 ; CHECK: br %r14
   %element = extractelement <4 x float> %val, i32 3
@@ -280,8 +347,8 @@ define void @f26(<4 x float> %val, ptr %ptr) {
 }
 
 ; Test v4f32 extraction of an invalid element. This must compile,
 ; but we don't care what it does.
-define void @f27(<4 x float> %val, ptr %ptr) {
-; CHECK-LABEL: f27:
+define void @f33(<4 x float> %val, ptr %ptr) {
+; CHECK-LABEL: f33:
 ; CHECK-NOT: vstef %v24, 0(%r2), 4
 ; CHECK: br %r14
   %element = extractelement <4 x float> %val, i32 4
@@ -290,8 +357,8 @@ define void @f27(<4 x float> %val, ptr %ptr) {
 }
 
 ; Test v4f32 extraction with the highest in-range offset.
-define void @f28(<4 x float> %val, ptr %base) {
-; CHECK-LABEL: f28:
+define void @f34(<4 x float> %val, ptr %base) {
+; CHECK-LABEL: f34:
 ; CHECK: vstef %v24, 4092(%r2), 2
 ; CHECK: br %r14
   %ptr = getelementptr float, ptr %base, i32 1023
@@ -301,8 +368,8 @@ define void @f28(<4 x float> %val, ptr %base) {
 }
 
 ; Test v4f32 extraction with the first ouf-of-range offset.
-define void @f29(<4 x float> %val, ptr %base) {
-; CHECK-LABEL: f29:
+define void @f35(<4 x float> %val, ptr %base) {
+; CHECK-LABEL: f35:
 ; CHECK: aghi %r2, 4096
 ; CHECK: vstef %v24, 0(%r2), 1
 ; CHECK: br %r14
@@ -313,8 +380,8 @@ define void @f29(<4 x float> %val, ptr %base) {
 }
 
 ; Test v4f32 extraction from a variable element.
-define void @f30(<4 x float> %val, ptr %ptr, i32 %index) {
-; CHECK-LABEL: f30:
+define void @f36(<4 x float> %val, ptr %ptr, i32 %index) {
+; CHECK-LABEL: f36:
 ; CHECK-NOT: vstef
 ; CHECK: br %r14
   %element = extractelement <4 x float> %val, i32 %index
@@ -323,8 +390,8 @@ define void @f30(<4 x float> %val, ptr %ptr, i32 %index) {
 }
 
 ; Test v2f64 extraction from the first element.
-define void @f32(<2 x double> %val, ptr %ptr) {
-; CHECK-LABEL: f32:
+define void @f37(<2 x double> %val, ptr %ptr) {
+; CHECK-LABEL: f37:
 ; CHECK: vsteg %v24, 0(%r2), 0
 ; CHECK: br %r14
   %element = extractelement <2 x double> %val, i32 0
@@ -333,8 +400,8 @@ define void @f32(<2 x double> %val, ptr %ptr) {
 }
 
 ; Test v2f64 extraction from the last element.
-define void @f33(<2 x double> %val, ptr %ptr) {
-; CHECK-LABEL: f33:
+define void @f38(<2 x double> %val, ptr %ptr) {
+; CHECK-LABEL: f38:
 ; CHECK: vsteg %v24, 0(%r2), 1
 ; CHECK: br %r14
   %element = extractelement <2 x double> %val, i32 1
@@ -343,8 +410,8 @@ define void @f33(<2 x double> %val, ptr %ptr) {
 }
 
 ; Test v2f64 extraction with the highest in-range offset.
-define void @f34(<2 x double> %val, ptr %base) {
-; CHECK-LABEL: f34:
+define void @f39(<2 x double> %val, ptr %base) {
+; CHECK-LABEL: f39:
 ; CHECK: vsteg %v24, 4088(%r2), 1
 ; CHECK: br %r14
   %ptr = getelementptr double, ptr %base, i32 511
@@ -354,8 +421,8 @@ define void @f34(<2 x double> %val, ptr %base) {
 }
 
 ; Test v2f64 extraction with the first ouf-of-range offset.
-define void @f35(<2 x double> %val, ptr %base) {
-; CHECK-LABEL: f35:
+define void @f40(<2 x double> %val, ptr %base) {
+; CHECK-LABEL: f40:
 ; CHECK: aghi %r2, 4096
 ; CHECK: vsteg %v24, 0(%r2), 0
 ; CHECK: br %r14
@@ -366,8 +433,8 @@ define void @f35(<2 x double> %val, ptr %base) {
 }
 
 ; Test v2f64 extraction from a variable element.
-define void @f36(<2 x double> %val, ptr %ptr, i32 %index) {
-; CHECK-LABEL: f36:
+define void @f41(<2 x double> %val, ptr %ptr, i32 %index) {
+; CHECK-LABEL: f41:
 ; CHECK-NOT: vsteg
 ; CHECK: br %r14
   %element = extractelement <2 x double> %val, i32 %index
@@ -376,8 +443,8 @@ define void @f36(<2 x double> %val, ptr %ptr, i32 %index) {
 }
 
 ; Test a v4i32 scatter of the first element.
-define void @f37(<4 x i32> %val, <4 x i32> %index, i64 %base) {
-; CHECK-LABEL: f37:
+define void @f42(<4 x i32> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f42:
 ; CHECK: vscef %v24, 0(%v26,%r2), 0
 ; CHECK: br %r14
   %elem = extractelement <4 x i32> %index, i32 0
@@ -390,8 +457,8 @@ define void @f37(<4 x i32> %val, <4 x i32> %index, i64 %base) {
 }
 
 ; Test a v4i32 scatter of the last element.
-define void @f38(<4 x i32> %val, <4 x i32> %index, i64 %base) {
-; CHECK-LABEL: f38:
+define void @f43(<4 x i32> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f43:
 ; CHECK: vscef %v24, 0(%v26,%r2), 3
 ; CHECK: br %r14
   %elem = extractelement <4 x i32> %index, i32 3
@@ -404,8 +471,8 @@ define void @f38(<4 x i32> %val, <4 x i32> %index, i64 %base) {
 }
 
 ; Test a v4i32 scatter with the highest in-range offset.
-define void @f39(<4 x i32> %val, <4 x i32> %index, i64 %base) {
-; CHECK-LABEL: f39:
+define void @f44(<4 x i32> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f44:
 ; CHECK: vscef %v24, 4095(%v26,%r2), 1
 ; CHECK: br %r14
   %elem = extractelement <4 x i32> %index, i32 1
@@ -419,8 +486,8 @@ define void @f39(<4 x i32> %val, <4 x i32> %index, i64 %base) {
 }
 
 ; Test a v2i64 scatter of the first element.
-define void @f40(<2 x i64> %val, <2 x i64> %index, i64 %base) {
-; CHECK-LABEL: f40:
+define void @f45(<2 x i64> %val, <2 x i64> %index, i64 %base) {
+; CHECK-LABEL: f45:
 ; CHECK: vsceg %v24, 0(%v26,%r2), 0
 ; CHECK: br %r14
   %elem = extractelement <2 x i64> %index, i32 0
@@ -432,8 +499,8 @@ define void @f40(<2 x i64> %val, <2 x i64> %index, i64 %base) {
 }
 
 ; Test a v2i64 scatter of the last element.
-define void @f41(<2 x i64> %val, <2 x i64> %index, i64 %base) {
-; CHECK-LABEL: f41:
+define void @f46(<2 x i64> %val, <2 x i64> %index, i64 %base) {
+; CHECK-LABEL: f46:
 ; CHECK: vsceg %v24, 0(%v26,%r2), 1
 ; CHECK: br %r14
   %elem = extractelement <2 x i64> %index, i32 1
@@ -445,8 +512,8 @@ define void @f41(<2 x i64> %val, <2 x i64> %index, i64 %base) {
 }
 
 ; Test a v4f32 scatter of the first element.
-define void @f42(<4 x float> %val, <4 x i32> %index, i64 %base) {
-; CHECK-LABEL: f42:
+define void @f47(<4 x float> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f47:
 ; CHECK: vscef %v24, 0(%v26,%r2), 0
 ; CHECK: br %r14
   %elem = extractelement <4 x i32> %index, i32 0
@@ -459,8 +526,8 @@ define void @f42(<4 x float> %val, <4 x i32> %index, i64 %base) {
 }
 
 ; Test a v4f32 scatter of the last element.
-define void @f43(<4 x float> %val, <4 x i32> %index, i64 %base) {
-; CHECK-LABEL: f43:
+define void @f48(<4 x float> %val, <4 x i32> %index, i64 %base) {
+; CHECK-LABEL: f48:
 ; CHECK: vscef %v24, 0(%v26,%r2), 3
 ; CHECK: br %r14
   %elem = extractelement <4 x i32> %index, i32 3
@@ -473,8 +540,8 @@ define void @f43(<4 x float> %val, <4 x i32> %index, i64 %base) {
 }
 
 ; Test a v2f64 scatter of the first element.
-define void @f44(<2 x double> %val, <2 x i64> %index, i64 %base) {
-; CHECK-LABEL: f44:
+define void @f49(<2 x double> %val, <2 x i64> %index, i64 %base) {
+; CHECK-LABEL: f49:
 ; CHECK: vsceg %v24, 0(%v26,%r2), 0
 ; CHECK: br %r14
   %elem = extractelement <2 x i64> %index, i32 0
@@ -486,8 +553,8 @@ define void @f44(<2 x double> %val, <2 x i64> %index, i64 %base) {
 }
 
 ; Test a v2f64 scatter of the last element.
-define void @f45(<2 x double> %val, <2 x i64> %index, i64 %base) {
-; CHECK-LABEL: f45:
+define void @f50(<2 x double> %val, <2 x i64> %index, i64 %base) {
+; CHECK-LABEL: f50:
 ; CHECK: vsceg %v24, 0(%v26,%r2), 1
 ; CHECK: br %r14
   %elem = extractelement <2 x i64> %index, i32 1
diff --git a/llvm/test/CodeGen/SystemZ/vec-move-12.ll b/llvm/test/CodeGen/SystemZ/vec-move-12.ll
index c862d86de64e8..2dadd5b0ff703 100644
--- a/llvm/test/CodeGen/SystemZ/vec-move-12.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-move-12.ll
@@ -102,9 +102,19 @@ define <2 x i64> @f10(ptr %ptr) {
   ret <2 x i64> %ret
 }
 
-; Test v4f32 insertion into an undef.
-define <4 x float> @f11(ptr %ptr) {
+; Test v8f16 insertion into an undef.
+define <8 x half> @f11(ptr %ptr) {
 ; CHECK-LABEL: f11:
+; CHECK: vlreph %v24, 0(%r2)
+; CHECK: br %r14
+  %val = load half, ptr %ptr
+  %ret = insertelement <8 x half> undef, half %val, i32 2
+  ret <8 x half> %ret
+}
+
+; Test v4f32 insertion into an undef.
+define <4 x float> @f12(ptr %ptr) {
+; CHECK-LABEL: f12:
 ; CHECK: vlrepf %v24, 0(%r2)
 ; CHECK: br %r14
   %val = load float, ptr %ptr
@@ -113,8 +123,8 @@ define <4 x float> @f11(ptr %ptr) {
 }
 
 ; Test v2f64 insertion into an undef.
-define <2 x double> @f12(ptr %ptr) {
-; CHECK-LABEL: f12:
+define <2 x double> @f13(ptr %ptr) {
+; CHECK-LABEL: f13:
 ; CHECK: vlrepg %v24, 0(%r2)
 ; CHECK: br %r14
   %val = load double, ptr %ptr
diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-01.ll b/llvm/test/CodeGen/SystemZ/vec-perm-01.ll
index 4beec05eaece3..4d1c6306d2ccc 100644
--- a/llvm/test/CodeGen/SystemZ/vec-perm-01.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-perm-01.ll
@@ -123,9 +123,42 @@ define <2 x i64> @f11(<2 x i64> %val) {
   ret <2 x i64> %ret
 }
 
-; Test v4f32 splat of the first element.
-define <4 x float> @f12(<4 x float> %val) {
+; Test v8f16 splat of the first element.
+define <8 x half> @f12(<8 x half> %val) {
 ; CHECK-LABEL: f12:
+; CHECK: vreph %v24, %v24, 0
+; CHECK: br %r14
+  %ret = shufflevector <8 x half> %val, <8 x half> undef,
+                       <8 x i32> zeroinitializer
+  ret <8 x half> %ret
+}
+
+; Test v8f16 splat of the last element.
+define <8 x half> @f13(<8 x half> %val) {
+; CHECK-LABEL: f13:
+; CHECK: vreph %v24, %v24, 7
+; CHECK: br %r14
+  %ret = shufflevector <8 x half> %val, <8 x half> undef,
+                       <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
+  ret <8 x half> %ret
+}
+
+; Test v8f16 splat of an arbitrary element, using the second operand of
+; the shufflevector.
+define <8 x half> @f14(<8 x half> %val) {
+; CHECK-LABEL: f14:
+; CHECK: vreph %v24, %v24, 2
+; CHECK: br %r14
+  %ret = shufflevector <8 x half> undef, <8 x half> %val,
+                       <8 x i32> <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10>
+  ret <8 x half> %ret
+}
+
+; Test v4f32 splat of the first element.
+define <4 x float> @f15(<4 x float> %val) {
+; CHECK-LABEL: f15:
 ; CHECK: vrepf %v24, %v24, 0
 ; CHECK: br %r14
   %ret = shufflevector <4 x float> %val, <4 x float> undef,
@@ -134,8 +167,8 @@ define <4 x float> @f12(<4 x float> %val) {
 }
 
 ; Test v4f32 splat of the last element.
-define <4 x float> @f13(<4 x float> %val) {
-; CHECK-LABEL: f13:
+define <4 x float> @f16(<4 x float> %val) {
+; CHECK-LABEL: f16:
 ; CHECK: vrepf %v24, %v24, 3
 ; CHECK: br %r14
   %ret = shufflevector <4 x float> %val, <4 x float> undef,
@@ -145,8 +178,8 @@
 }
 
 ; Test v4f32 splat of an arbitrary element, using the second operand of
 ; the shufflevector.
-define <4 x float> @f14(<4 x float> %val) {
-; CHECK-LABEL: f14:
+define <4 x float> @f17(<4 x float> %val) {
+; CHECK-LABEL: f17:
 ; CHECK: vrepf %v24, %v24, 1
 ; CHECK: br %r14
   %ret = shufflevector <4 x float> undef, <4 x float> %val,
@@ -155,8 +188,8 @@ define <4 x float> @f14(<4 x float> %val) {
 }
 
 ; Test v2f64 splat of the first element.
-define <2 x double> @f15(<2 x double> %val) {
-; CHECK-LABEL: f15:
+define <2 x double> @f18(<2 x double> %val) {
+; CHECK-LABEL: f18:
 ; CHECK: vrepg %v24, %v24, 0
 ; CHECK: br %r14
   %ret = shufflevector <2 x double> %val, <2 x double> undef,
@@ -165,8 +198,8 @@ define <2 x double> @f15(<2 x double> %val) {
 }
 
 ; Test v2f64 splat of the last element.
-define <2 x double> @f16(<2 x double> %val) {
-; CHECK-LABEL: f16:
+define <2 x double> @f19(<2 x double> %val) {
+; CHECK-LABEL: f19:
 ; CHECK: vrepg %v24, %v24, 1
 ; CHECK: br %r14
   %ret = shufflevector <2 x double> %val, <2 x double> undef,
diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-03.ll b/llvm/test/CodeGen/SystemZ/vec-perm-03.ll
index aa18923055575..91727315e2ef7 100644
--- a/llvm/test/CodeGen/SystemZ/vec-perm-03.ll
+++ b/llvm/test/CodeGen/SystemZ/vec-perm-03.ll
@@ -158,9 +158,48 @@ define <2 x i64> @f12(ptr %base) {
   ret <2 x i64> %ret
 }
 
-; Test a v4f32 replicating load with no offset.
-define <4 x float> @f13(ptr %ptr) {
+; Test a v8f16 replicating load with no offset.
+define <8 x half> @f13(ptr %ptr) {
 ; CHECK-LABEL: f13:
+; CHECK: vlreph %v24, 0(%r2)
+; CHECK: br %r14
+  %scalar = load half, ptr %ptr
+  %val = insertelement <8 x half> undef, half %scalar, i32 0
+  %ret = shufflevector <8 x half> %val, <8 x half> undef,
+                       <8 x i32> zeroinitializer
+  ret <8 x half> %ret
+}
+
+; Test a v8f16 replicating load with the maximum in-range offset.
+define <8 x half> @f14(ptr %base) {
+; CHECK-LABEL: f14:
+; CHECK: vlreph %v24, 4094(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr half, ptr %base, i64 2047
+  %scalar = load half, ptr %ptr
+  %val = insertelement <8 x half> undef, half %scalar, i32 0
+  %ret = shufflevector <8 x half> %val, <8 x half> undef,
+                       <8 x i32> zeroinitializer
+  ret <8 x half> %ret
+}
+
+; Test a v8f16 replicating load with the first out-of-range offset.
+define <8 x half> @f15(ptr %base) {
+; CHECK-LABEL: f15:
+; CHECK: aghi %r2, 4096
+; CHECK: vlreph %v24, 0(%r2)
+; CHECK: br %r14
+  %ptr = getelementptr half, ptr %base, i64 2048
+  %scalar = load half, ptr %ptr
+  %val = insertelement <8 x half> undef, half %scalar, i32 0
+  %ret = shufflevector <8 x half> %val, <8 x half> undef,
+                       <8 x i32> zeroinitializer
+  ret <8 x half> %ret
+}
+
+; Test a v4f32 replicating load with no offset.
+define <4 x float> @f16(ptr %ptr) {
+; CHECK-LABEL: f16:
 ; CHECK: vlrepf %v24, 0(%r2)
 ; CHECK: br %r14
   %scalar = load float, ptr %ptr
@@ -171,8 +210,8 @@ define <4 x float> @f13(ptr %ptr) {
 }
 
 ; Test a v4f32 replicating load with the maximum in-range offset.
-define <4 x float> @f14(ptr %base) {
-; CHECK-LABEL: f14:
+define <4 x float> @f17(ptr %base) {
+; CHECK-LABEL: f17:
 ; CHECK: vlrepf %v24, 4092(%r2)
 ; CHECK: br %r14
   %ptr = getelementptr float, ptr %base, i64 1023
@@ -184,8 +223,8 @@ define <4 x float> @f14(ptr %base) {
 }
 
 ; Test a v4f32 replicating load with the first out-of-range offset.
-define <4 x float> @f15(ptr %base) {
-; CHECK-LABEL: f15:
+define <4 x float> @f18(ptr %base) {
+; CHECK-LABEL: f18:
 ; CHECK: aghi %r2, 4096
 ; CHECK: vlrepf %v24, 0(%r2)
 ; CHECK: br %r14
@@ -198,8 +237,8 @@
 }
 
 ; Test a v2f64 replicating load with no offset.
-define <2 x double> @f16(ptr %ptr) {
-; CHECK-LABEL: f16:
+define <2 x double> @f19(ptr %ptr) {
+; CHECK-LABEL: f19:
 ; CHECK: vlrepg %v24, 0(%r2)
 ; CHECK: br %r14
   %scalar = load double, ptr %ptr
@@ -210,8 +249,8 @@ define <2 x double> @f16(ptr %ptr) {
 }
 
 ; Test a v2f64 replicating load with the maximum in-range offset.
-define <2 x double> @f17(ptr %base) {
-; CHECK-LABEL: f17:
+define <2 x double> @f20(ptr %base) {
+; CHECK-LABEL: f20:
 ; CHECK: vlrepg %v24, 4088(%r2)
 ; CHECK: br %r14
   %ptr = getelementptr double, ptr %base, i32 511
@@ -223,8 +262,8 @@ define <2 x double> @f17(ptr %base) {
 }
 
 ; Test a v2f64 replicating load with the first out-of-range offset.
-define <2 x double> @f18(ptr %base) {
-; CHECK-LABEL: f18:
+define <2 x double> @f21(ptr %base) {
+; CHECK-LABEL: f21:
 ; CHECK: aghi %r2, 4096
 ; CHECK: vlrepg %v24, 0(%r2)
 ; CHECK: br %r14
@@ -237,8 +276,8 @@
 }
 
 ; Test a v16i8 replicating load with an index.
-define <16 x i8> @f19(ptr %base, i64 %index) {
-; CHECK-LABEL: f19:
+define <16 x i8> @f22(ptr %base, i64 %index) {
+; CHECK-LABEL: f22:
 ; CHECK: vlrepb %v24, 1023(%r3,%r2)
 ; CHECK: br %r14
   %ptr1 = getelementptr i8, ptr %base, i64 %index