diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 14879e78175087..aed9bffc551f47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4106,7 +4106,7 @@ InstructionSelector::ComplexRendererFns
 AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const {
   std::optional<FPValueAndVReg> FPValReg;
   if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) {
-    if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) {
+    if (TII.isInlineConstant(FPValReg->Value)) {
       return {{[=](MachineInstrBuilder &MIB) {
         MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue());
       }}};
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 79ad6ddf7861fc..7d4233c442188d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -1927,8 +1927,12 @@ static bool isInlineableLiteralOp16(int64_t Val, MVT VT, bool HasInv2Pi) {
     return isInlinableIntLiteral(Val);
   }
 
-  // f16/v2f16 operands work correctly for all values.
-  return AMDGPU::isInlinableLiteral16(Val, HasInv2Pi);
+  if (VT.getScalarType() == MVT::f16)
+    return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
+
+  assert(VT.getScalarType() == MVT::bf16);
+
+  return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
 }
 
 bool AMDGPUOperand::isInlinableImm(MVT type) const {
@@ -2277,15 +2281,26 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
     return;
 
   case AMDGPU::OPERAND_REG_IMM_INT16:
-  case AMDGPU::OPERAND_REG_IMM_FP16:
-  case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
-  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+    if (isSafeTruncation(Val, 16) &&
+        AMDGPU::isInlinableIntLiteral(static_cast<int16_t>(Val))) {
+      Inst.addOperand(MCOperand::createImm(Val));
+      setImmKindConst();
+      return;
+    }
+
+    Inst.addOperand(MCOperand::createImm(Val & 0xffff));
+    setImmKindLiteral();
+    return;
+
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
     if (isSafeTruncation(Val, 16) &&
-        AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
-                                     AsmParser->hasInv2PiInlineImm())) {
+        AMDGPU::isInlinableLiteralFP16(static_cast<int16_t>(Val),
+                                       AsmParser->hasInv2PiInlineImm())) {
       Inst.addOperand(MCOperand::createImm(Val));
       setImmKindConst();
       return;
@@ -2296,12 +2311,17 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
     return;
 
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: {
+    assert(isSafeTruncation(Val, 16));
+    assert(AMDGPU::isInlinableIntLiteral(static_cast<int16_t>(Val)));
+    Inst.addOperand(MCOperand::createImm(Val));
+    return;
+  }
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
-  case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
     assert(isSafeTruncation(Val, 16));
-    assert(AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
-                                        AsmParser->hasInv2PiInlineImm()));
+    assert(AMDGPU::isInlinableLiteralFP16(static_cast<int16_t>(Val),
+                                          AsmParser->hasInv2PiInlineImm()));
 
     Inst.addOperand(MCOperand::createImm(Val));
     return;
@@ -3429,7 +3449,13 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
         OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
       return AMDGPU::isInlinableLiteralV2F16(Val);
 
-    return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
+    if (OperandType == AMDGPU::OPERAND_REG_IMM_FP16 ||
+        OperandType == AMDGPU::OPERAND_REG_INLINE_C_FP16 ||
+        OperandType == AMDGPU::OPERAND_REG_INLINE_AC_FP16 ||
+        OperandType == AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED)
+      return AMDGPU::isInlinableLiteralFP16(Val, hasInv2PiInlineImm());
+
+    llvm_unreachable("invalid operand type");
   }
   default:
     llvm_unreachable("invalid operand size");
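The parser-side predicate the hunks above now call, isInlinableLiteralFP16, accepts either a small inline integer or one of nine fixed half-precision bit patterns. A minimal standalone C++ sketch of that check, with hypothetical helper names (the in-tree functions live in AMDGPUBaseInfo.cpp; -16..64 is the AMDGPU inline integer range):

#include <cstdint>
#include <cstdio>

// Hypothetical stand-ins for AMDGPU::isInlinableIntLiteral and
// AMDGPU::isInlinableLiteralFP16, not the in-tree implementations.
static bool inlinableInt(int64_t V) { return V >= -16 && V <= 64; }

static bool inlinableFP16(int16_t Literal, bool HasInv2Pi) {
  if (!HasInv2Pi)
    return false; // mirrors the early-out in the patch
  if (inlinableInt(Literal))
    return true;
  uint16_t Val = static_cast<uint16_t>(Literal);
  return Val == 0x3C00 || Val == 0xBC00 || // +/-1.0 (IEEE half)
         Val == 0x3800 || Val == 0xB800 || // +/-0.5
         Val == 0x4000 || Val == 0xC000 || // +/-2.0
         Val == 0x4400 || Val == 0xC400 || // +/-4.0
         Val == 0x3118;                    // 1/(2*pi)
}

int main() {
  printf("%d %d %d\n",
         inlinableFP16(0x3C00, true), // 1: fp16 1.0 is inline
         inlinableFP16(0x3F80, true), // 0: bf16 1.0 is not an fp16 inline
         inlinableFP16(64, true));    // 1: inline integer
}

This is exactly why the INT16 and FP16 operand cases are split above: the integer cases must not fall into the FP16 table, and bf16 operands get their own table later in the patch.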
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 4ab3aa5a0240ad..d76c99845000b0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -462,8 +462,8 @@ void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
 
 // This must accept a 32-bit immediate value to correctly handle packed 16-bit
 // operations.
-static bool printImmediateFloat16(uint32_t Imm, const MCSubtargetInfo &STI,
-                                  raw_ostream &O) {
+static bool printImmediateFP16(uint32_t Imm, const MCSubtargetInfo &STI,
+                               raw_ostream &O) {
   if (Imm == 0x3C00)
     O << "1.0";
   else if (Imm == 0xBC00)
@@ -488,7 +488,7 @@ static bool printImmediateFP16(uint32_t Imm, const MCSubtargetInfo &STI,
   return true;
 }
 
-void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
+void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, uint8_t OpType,
                                          const MCSubtargetInfo &STI,
                                          raw_ostream &O) {
   int16_t SImm = static_cast<int16_t>(Imm);
@@ -498,8 +498,17 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
   }
 
   uint16_t HImm = static_cast<uint16_t>(Imm);
-  if (printImmediateFloat16(HImm, STI, O))
-    return;
+  switch (OpType) {
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+  case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
+    if (printImmediateFP16(HImm, STI, O))
+      return;
+    break;
+  default:
+    llvm_unreachable("bad operand type");
+  }
 
   uint64_t Imm16 = static_cast<uint64_t>(Imm);
   O << formatHex(Imm16);
@@ -525,7 +534,7 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
     if (isUInt<16>(Imm) &&
-        printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O))
+        printImmediateFP16(static_cast<uint16_t>(Imm), STI, O))
       return;
     break;
   default:
@@ -797,7 +806,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
     case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
     case AMDGPU::OPERAND_REG_IMM_FP16:
     case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
-      printImmediate16(Op.getImm(), STI, O);
+      printImmediate16(Op.getImm(), OpTy, STI, O);
       break;
     case AMDGPU::OPERAND_REG_IMM_V2INT16:
     case AMDGPU::OPERAND_REG_IMM_V2FP16:
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index e91ff86b219a0c..d14a36e77b6612 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -86,8 +86,8 @@ class AMDGPUInstPrinter : public MCInstPrinter {
                  raw_ostream &O);
   void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI,
                            raw_ostream &O);
-  void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
-                        raw_ostream &O);
+  void printImmediate16(uint32_t Imm, uint8_t OpType,
+                        const MCSubtargetInfo &STI, raw_ostream &O);
   void printImmediateV216(uint32_t Imm, uint8_t OpType,
                           const MCSubtargetInfo &STI, raw_ostream &O);
   bool printImmediateFloat32(uint32_t Imm, const MCSubtargetInfo &STI,
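printImmediate16 now takes the operand type because the raw 16 bits alone no longer determine the rendering: 0x3C00 is 1.0 only under fp16 semantics, while bf16's 1.0 is 0x3F80. A minimal sketch of the idea (hypothetical helper, not the in-tree printer, which also handles integer and hex fallbacks):

#include <cstdint>
#include <cstdio>

enum class Imm16Kind { FP16, BF16 };

// Returns a pretty spelling for known inline values, or nullptr so the
// caller can fall back to hex output (formatHex in the real printer).
static const char *prettyImm16(uint16_t Imm, Imm16Kind Kind) {
  if (Kind == Imm16Kind::FP16 && Imm == 0x3C00)
    return "1.0";
  if (Kind == Imm16Kind::BF16 && Imm == 0x3F80)
    return "1.0";
  return nullptr;
}

int main() {
  const char *A = prettyImm16(0x3C00, Imm16Kind::FP16);
  const char *B = prettyImm16(0x3C00, Imm16Kind::BF16);
  printf("fp16: %s, bf16: %s\n", A, B ? B : "0x3c00 (hex fallback)");
}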
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 56f0e716423955..3a6d059ac6acb6 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12965,10 +12965,8 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
 
   const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
 
-  if ((!K0->hasOneUse() ||
-       TII->isInlineConstant(K0->getValueAPF().bitcastToAPInt())) &&
-      (!K1->hasOneUse() ||
-       TII->isInlineConstant(K1->getValueAPF().bitcastToAPInt()))) {
+  if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
+      (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
     return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0),
                        Var, SDValue(K0, 0), SDValue(K1, 0));
   }
@@ -15391,16 +15389,22 @@ bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
   llvm_unreachable("Invalid asm constraint");
 }
 
-bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
-                                              uint64_t Val,
+bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
                                               unsigned MaxSize) const {
   unsigned Size = std::min(Op.getScalarValueSizeInBits(), MaxSize);
   bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
-  if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) ||
-      (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
-      (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) {
-    return true;
+  if (Size == 16) {
+    MVT VT = Op.getSimpleValueType();
+    if (VT == MVT::i16 && AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi))
+      return true;
+    if (VT == MVT::f16 && AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi))
+      return true;
+    if (VT == MVT::bf16 && AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi))
+      return true;
   }
+  if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
+      (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
+    return true;
   return false;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index f5ec831234f2f9..e88189ac5d3f76 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4121,8 +4121,27 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
                                         ST.hasInv2PiInlineImm());
   case 16:
     return ST.has16BitInsts() &&
-           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
-                                        ST.hasInv2PiInlineImm());
+           AMDGPU::isInlinableLiteralI16(Imm.getSExtValue(),
+                                         ST.hasInv2PiInlineImm());
+  default:
+    llvm_unreachable("invalid bitwidth");
+  }
+}
+
+bool SIInstrInfo::isInlineConstant(const APFloat &Imm) const {
+  APInt IntImm = Imm.bitcastToAPInt();
+  bool HasInv2Pi = ST.hasInv2PiInlineImm();
+  switch (IntImm.getBitWidth()) {
+  case 32:
+  case 64:
+    return isInlineConstant(IntImm);
+  case 16:
+    if (&Imm.getSemantics() == &APFloat::IEEEhalf())
+      return ST.has16BitInsts() &&
+             AMDGPU::isInlinableLiteralFP16(IntImm.getSExtValue(), HasInv2Pi);
+
+    return ST.has16BitInsts() &&
+           AMDGPU::isInlinableLiteralBF16(IntImm.getSExtValue(), HasInv2Pi);
   default:
     llvm_unreachable("invalid bitwidth");
   }
@@ -4196,7 +4215,7 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
     // constants in these cases
     int16_t Trunc = static_cast<int16_t>(Imm);
     return ST.has16BitInsts() &&
-           AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+           AMDGPU::isInlinableLiteralFP16(Trunc, ST.hasInv2PiInlineImm());
   }
 
   return false;
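The new APFloat overload cannot simply bitcast and dispatch on width: half and bfloat are both 16 bits wide, and only the fltSemantics tell them apart. A small standalone check of that distinction against LLVM's APFloat API (assumes an LLVM build to compile and link against; this is not the in-tree function):

#include "llvm/ADT/APFloat.h"
#include <cassert>

using namespace llvm;

int main() {
  APFloat Half(APFloat::IEEEhalf(), "1.0");
  APFloat BF(APFloat::BFloat(), "1.0");

  // Same bit width, different semantics, different encodings of 1.0.
  assert(Half.bitcastToAPInt().getBitWidth() == 16);
  assert(BF.bitcastToAPInt().getBitWidth() == 16);
  assert(Half.bitcastToAPInt().getZExtValue() == 0x3C00);
  assert(BF.bitcastToAPInt().getZExtValue() == 0x3F80);

  // The discriminator used in the overload above: compare the address of
  // the semantics object, the usual APFloat idiom.
  assert(&Half.getSemantics() == &APFloat::IEEEhalf());
  assert(&BF.getSemantics() == &APFloat::BFloat());
  return 0;
}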
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 7a6c28421c8d7a..caf40a0a4627ec 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -966,9 +966,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
 
   bool isInlineConstant(const APInt &Imm) const;
 
-  bool isInlineConstant(const APFloat &Imm) const {
-    return isInlineConstant(Imm.bitcastToAPInt());
-  }
+  bool isInlineConstant(const APFloat &Imm) const;
 
   // Returns true if this non-register operand definitely does not need to be
   // encoded as a 32-bit literal. Note that this function handles all kinds of
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 800dfcf3076dd3..ced2aef88ef3b3 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -2652,13 +2652,28 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
          (Val == 0x3e22f983 && HasInv2Pi);
 }
 
-bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
+bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi) {
+  if (!HasInv2Pi)
+    return false;
+  if (isInlinableIntLiteral(Literal))
+    return true;
+  return (Literal == static_cast<int32_t>(llvm::bit_cast<uint32_t>(0.0f))) ||
+         (Literal == static_cast<int32_t>(llvm::bit_cast<uint32_t>(1.0f))) ||
+         (Literal == static_cast<int32_t>(llvm::bit_cast<uint32_t>(-1.0f))) ||
+         (Literal == static_cast<int32_t>(llvm::bit_cast<uint32_t>(0.5f))) ||
+         (Literal == static_cast<int32_t>(llvm::bit_cast<uint32_t>(-0.5f))) ||
+         (Literal == static_cast<int32_t>(llvm::bit_cast<uint32_t>(2.0f))) ||
+         (Literal == static_cast<int32_t>(llvm::bit_cast<uint32_t>(-2.0f))) ||
+         (Literal == static_cast<int32_t>(llvm::bit_cast<uint32_t>(4.0f))) ||
+         (Literal == static_cast<int32_t>(llvm::bit_cast<uint32_t>(-4.0f))) ||
+         (Literal == static_cast<int32_t>(0x3e22f983));
+}
+
+bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi) {
   if (!HasInv2Pi)
     return false;
-
   if (isInlinableIntLiteral(Literal))
     return true;
-
   uint16_t Val = static_cast<uint16_t>(Literal);
   return Val == 0x3C00 || // 1.0
          Val == 0xBC00 || // -1.0
@@ -2671,6 +2686,23 @@ bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi) {
          Val == 0x3118;   // 1/2pi
 }
 
+bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi) {
+  if (!HasInv2Pi)
+    return false;
+  if (isInlinableIntLiteral(Literal))
+    return true;
+  uint16_t Val = static_cast<uint16_t>(Literal);
+  return Val == 0x3F00 || // 0.5
+         Val == 0xBF00 || // -0.5
+         Val == 0x3F80 || // 1.0
+         Val == 0xBF80 || // -1.0
+         Val == 0x4000 || // 2.0
+         Val == 0xC000 || // -2.0
+         Val == 0x4080 || // 4.0
+         Val == 0xC080 || // -4.0
+         Val == 0x3E22;   // 1.0 / (2.0 * pi)
+}
+
 std::optional<unsigned> getInlineEncodingV216(bool IsFloat, uint32_t Literal) {
   // Unfortunately, the Instruction Set Architecture Reference Guide is
   // misleading about how the inline operands work for (packed) 16-bit
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index b56025f55519a5..f8762c4cee9c06 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1374,7 +1374,13 @@ LLVM_READNONE
 bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
 
 LLVM_READNONE
-bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
+bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi);
+
+LLVM_READNONE
+bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi);
 
 LLVM_READNONE
 std::optional<unsigned> getInlineEncodingV2I16(uint32_t Literal);
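The BF16 table added above is the f32 inline table with the low 16 bits dropped, since bf16 is the top half of an IEEE f32. Note that 1/(2*pi) is truncated (0x3E22), not rounded to nearest (which would give 0x3E23), matching the hardware table. A standalone check of those constants:

#include <cstdint>
#include <cstdio>
#include <cstring>

// bf16 bit pattern of an f32, by truncation (drop the low 16 bits).
static uint16_t bf16Trunc(float F) {
  uint32_t U;
  std::memcpy(&U, &F, sizeof(U)); // portable bit_cast
  return static_cast<uint16_t>(U >> 16);
}

int main() {
  printf("0.5     -> 0x%04X\n", bf16Trunc(0.5f));        // 0x3F00
  printf("1.0     -> 0x%04X\n", bf16Trunc(1.0f));        // 0x3F80
  printf("2.0     -> 0x%04X\n", bf16Trunc(2.0f));        // 0x4000
  printf("4.0     -> 0x%04X\n", bf16Trunc(4.0f));        // 0x4080
  printf("1/(2pi) -> 0x%04X\n", bf16Trunc(0.15915494f)); // 0x3E22
}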
diff --git a/llvm/test/CodeGen/AMDGPU/immv216.ll b/llvm/test/CodeGen/AMDGPU/immv216.ll
index b66ca71a327495..ae51c3edf1c7e7 100644
--- a/llvm/test/CodeGen/AMDGPU/immv216.ll
+++ b/llvm/test/CodeGen/AMDGPU/immv216.ll
@@ -577,40 +577,40 @@ define amdgpu_kernel void @add_inline_imm_64_v2f16(ptr addrspace(1) %out, <2 x half> %x) #0 {
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_0.5_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x38003800
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3800
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0x38003800, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x38]
+; GFX10: v_pk_mul_lo_u16 v0, 0x3800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x38,0x00,0x00]
 define <2 x i16> @mul_inline_imm_0.5_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xH3800, half 0xH3800> to <2 x i16>)
   ret <2 x i16> %y
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_neg_0.5_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xb800b800
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0xb800
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0xb800b800, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0x00,0xb8]
+; GFX10: v_pk_mul_lo_u16 v0, 0xffffb800, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xb8,0xff,0xff]
 define <2 x i16> @mul_inline_imm_neg_0.5_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xHB800, half 0xHB800> to <2 x i16>)
   ret <2 x i16> %y
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_1.0_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x3c003c00
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3c00
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0x3c003c00, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x3c]
+; GFX10: v_pk_mul_lo_u16 v0, 0x3c00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x3c,0x00,0x00]
 define <2 x i16> @mul_inline_imm_1.0_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xH3C00, half 0xH3C00> to <2 x i16>)
   ret <2 x i16> %y
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_neg_1.0_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00bc00
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0xbc00
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0xbc00bc00, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0x00,0xbc]
+; GFX10: v_pk_mul_lo_u16 v0, 0xffffbc00, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xbc,0xff,0xff]
 define <2 x i16> @mul_inline_imm_neg_1.0_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xHBC00, half 0xHBC00> to <2 x i16>)
   ret <2 x i16> %y
@@ -635,10 +635,10 @@ define <2 x i16> @shl_inline_imm_neg_2.0_v2i16(<2 x i16> %x) {
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_4.0_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004400
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x4400
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0x44004400, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x44]
+; GFX10: v_pk_mul_lo_u16 v0, 0x4400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0x44,0x00,0x00]
 define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xH4400, half 0xH4400> to <2 x i16>)
   ret <2 x i16> %y
@@ -646,20 +646,20 @@ define <2 x i16> @mul_inline_imm_4.0_v2i16(<2 x i16> %x) {
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_neg_4.0_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xc400c400
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0xc400
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0xc400c400, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0x00,0xc4]
+; GFX10: v_pk_mul_lo_u16 v0, 0xffffc400, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x00,0xc4,0xff,0xff]
 define <2 x i16> @mul_inline_imm_neg_4.0_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xHC400, half 0xHC400> to <2 x i16>)
   ret <2 x i16> %y
 }
 
 ; GCN-LABEL: {{^}}mul_inline_imm_inv2pi_v2i16:
-; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x31183118
-; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]]
+; GFX9: s_movk_i32 [[K:s[0-9]+]], 0x3118
+; GFX9: v_pk_mul_lo_u16 v0, v0, [[K]] op_sel_hi:[1,0]
 
-; GFX10: v_pk_mul_lo_u16 v0, 0x31183118, v0 ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x18,0x31]
+; GFX10: v_pk_mul_lo_u16 v0, 0x3118, v0 op_sel_hi:[0,1] ; encoding: [0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0xff,0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x{{[0-9a-f]+}},0x18,0x31,0x00,0x00]
 define <2 x i16> @mul_inline_imm_inv2pi_v2i16(<2 x i16> %x) {
   %y = mul <2 x i16> %x, bitcast (<2 x half> <half 0xH3118, half 0xH3118> to <2 x i16>)
   ret <2 x i16> %y
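The immv216.ll updates above show the codegen payoff: a packed v2i16 literal whose two halves are equal no longer needs a 32-bit literal (or an s_mov_b32 holding one); the compiler emits just the 16-bit half and replicates it through op_sel_hi. A sketch of the splat detection, with a hypothetical helper name (not the in-tree logic):

#include <cstdint>
#include <cstdio>
#include <optional>

// If both 16-bit halves of a packed literal agree, return the half.
static std::optional<uint16_t> asSplat16(uint32_t Packed) {
  uint16_t Lo = static_cast<uint16_t>(Packed);
  uint16_t Hi = static_cast<uint16_t>(Packed >> 16);
  if (Lo == Hi)
    return Lo;
  return std::nullopt;
}

int main() {
  if (auto Splat = asSplat16(0x38003800)) // <0.5, 0.5> as fp16 bits
    printf("emit 0x%04X once, replicate via op_sel_hi\n", *Splat);
  if (!asSplat16(0x3800BC00))
    printf("non-splat still needs the full 32-bit literal\n");
}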
diff --git a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
index 9ef246fe2e1015..725b2ca9579e80 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -99,7 +99,7 @@ define i32 @inline_A_constraint_H1() {
 ; VI-LABEL: {{^}}inline_A_constraint_H2:
 ; VI: v_mov_b32 {{v[0-9]+}}, 0x3c00
 define i32 @inline_A_constraint_H2() {
-  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half 1.0 to i16))
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 trunc (i32 bitcast (float 1.0 to i32) to i16))
   ret i32 %v0
 }
 
@@ -107,7 +107,7 @@ define i32 @inline_A_constraint_H2() {
 ; VI-LABEL: {{^}}inline_A_constraint_H3:
 ; VI: v_mov_b32 {{v[0-9]+}}, 0xbc00
 define i32 @inline_A_constraint_H3() {
-  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half -1.0 to i16))
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 trunc (i32 bitcast (float -1.0 to i32) to i16))
   ret i32 %v0
 }
 
@@ -115,7 +115,7 @@ define i32 @inline_A_constraint_H3() {
 ; VI-LABEL: {{^}}inline_A_constraint_H4:
 ; VI: v_mov_b32 {{v[0-9]+}}, 0x3118
 define i32 @inline_A_constraint_H4() {
-  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(half 0xH3118)
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(float 0x3FC45F3060000000)
   ret i32 %v0
 }
 
@@ -123,7 +123,7 @@ define i32 @inline_A_constraint_H4() {
 ; VI-LABEL: {{^}}inline_A_constraint_H5:
 ; VI: v_mov_b32 {{v[0-9]+}}, 0x3118
 define i32 @inline_A_constraint_H5() {
-  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 bitcast (half 0xH3118 to i16))
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(i16 trunc (i32 bitcast (float 0x3FC45F3060000000 to i32) to i16))
   ret i32 %v0
 }
 
@@ -131,7 +131,7 @@ define i32 @inline_A_constraint_H5() {
 ; VI-LABEL: {{^}}inline_A_constraint_H6:
 ; VI: v_mov_b32 {{v[0-9]+}}, 0xb800
 define i32 @inline_A_constraint_H6() {
-  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(half -0.5)
+  %v0 = tail call i32 asm "v_mov_b32 $0, $1", "=v,A"(float -0.5)
   ret i32 %v0
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 5de9b0b92c9a02..1a55bf608ebf51 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -3400,9 +3400,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0xc400c400
+; GFX9-SDAG-NEXT:    s_movk_i32 s2, 0xc400
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -3418,29 +3418,53 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: v_test_v2i16_x_add_neg_fpone:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_v2i16_x_add_neg_fpone:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, 0xffffc400, v1 op_sel_hi:[0,1]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_fpone:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xc400c400, v1
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext
@@ -3541,9 +3565,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x44004400
+; GFX9-SDAG-NEXT:    s_movk_i32 s2, 0x4400
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, s2 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-SDAG-NEXT:    s_endpgm
 ;
@@ -3559,29 +3583,53 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
-; GFX10-LABEL: v_test_v2i16_x_add_neg_negfpone:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
-; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
-; GFX10-NEXT:    s_endpgm
+; GFX10-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX10-SDAG:       ; %bb.0:
+; GFX10-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
+; GFX10-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-SDAG-NEXT:    s_endpgm
 ;
-; GFX11-LABEL: v_test_v2i16_x_add_neg_negfpone:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
-; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
-; GFX11-NEXT:    s_nop 0
-; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT:    s_endpgm
+; GFX10-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX10-GISEL:       ; %bb.0:
+; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
+; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX10-GISEL-NEXT:    s_endpgm
+;
+; GFX11-SDAG-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX11-SDAG:       ; %bb.0:
+; GFX11-SDAG-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, 0x4400, v1 op_sel_hi:[0,1]
+; GFX11-SDAG-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-SDAG-NEXT:    s_nop 0
+; GFX11-SDAG-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT:    s_endpgm
+;
+; GFX11-GISEL-LABEL: v_test_v2i16_x_add_neg_negfpone:
+; GFX11-GISEL:       ; %bb.0:
+; GFX11-GISEL-NEXT:    s_load_b128 s[0:3], s[0:1], 0x24
+; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0x44004400, v1
+; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX11-GISEL-NEXT:    s_nop 0
+; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in, i64 %tid.ext