[RFC][WIP][AMDGPU] Use bf16 instead of i16 for bfloat
Currently we generally use `i16` to represent `bf16` in those TableGen files.
I'm not sure of the reason behind that; my guess is that the `bf16` type was
not yet available when the support was first enabled. This patch uses `bf16`
directly in those TableGen files, aiming to fix llvm#79369. A workaround for
llvm#79369 would be to treat all `INT16` variants as `BFloat` in
`getOpFltSemantics`, but that doesn't look right IMHO.

Since I'm fairly new to the AMDGPU backend, I'd appreciate it if you could
point out anything I've misunderstood.
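
For reference, the workaround dismissed above would read roughly like this
inside `getOpFltSemantics` in AMDGPUAsmParser.cpp (a sketch of the rejected
alternative, not part of this patch):

```cpp
// Rejected alternative: keep bf16 operands typed as INT16 in TableGen and
// only switch to bfloat semantics when parsing FP literals. This conflates
// genuinely-integer i16 operands with bf16 ones, so every INT16 operand
// would suddenly accept bf16 literals.
static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
  switch (OperandType) {
  case AMDGPU::OPERAND_REG_IMM_INT16:
  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
  case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
    return &APFloat::BFloat(); // bf16 masquerading as i16
  // ... FP16 cases keep returning &APFloat::IEEEhalf() ...
  default:
    llvm_unreachable("unsupported fp type");
  }
}
```
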
shiltian committed Feb 8, 2024
1 parent 92eaf03 commit 672fd3c
Showing 16 changed files with 202 additions and 67 deletions.
4 changes: 0 additions & 4 deletions clang/lib/CodeGen/CGBuiltin.cpp
@@ -5908,8 +5908,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       }
     }

-      assert(ArgValue->getType()->canLosslesslyBitCastTo(PTy) &&
-             "Must be able to losslessly bit cast to param");
       // Cast vector type (e.g., v256i32) to x86_amx, this only happen
       // in amx intrinsics.
       if (PTy->isX86_AMXTy())
@@ -5939,8 +5937,6 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
     }
   }

-  assert(V->getType()->canLosslesslyBitCastTo(RetTy) &&
-         "Must be able to losslessly bit cast result type");
   // Cast x86_amx to vector type (e.g., v256i32), this only happen
   // in amx intrinsics.
   if (V->getType()->isX86_AMXTy())
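
Why the asserts had to go (my understanding; please correct me if wrong):
the intrinsics now take and return `bfloat` while the builtin's Clang-level
arguments are still 16-bit integers, so `EmitBuiltinExpr` emits an
`i16` <-> `bfloat` bitcast. That cast is valid IR, but
`Type::canLosslesslyBitCastTo` only accepts identical types, equal-sized
vectors, and pointer-to-pointer casts, so the assert would fire on the
scalar case. A standalone sketch:

```cpp
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
using namespace llvm;

// A scalar i16 <-> bfloat bitcast is valid IR, yet it is not "lossless" in
// the canLosslesslyBitCastTo sense, which rejects all scalar cross-kind
// casts regardless of width.
void demo() {
  LLVMContext Ctx;
  IRBuilder<> B(Ctx);
  Type *I16 = B.getInt16Ty();
  Type *BF16 = B.getBFloatTy();
  assert(!I16->canLosslesslyBitCastTo(BF16)); // false despite equal widths
  // The vector form *is* considered lossless (same total size):
  assert(FixedVectorType::get(I16, 2)->canLosslesslyBitCastTo(
      FixedVectorType::get(BF16, 2)));
}
```
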
12 changes: 6 additions & 6 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2819,11 +2819,11 @@ def int_amdgcn_fdot2_f16_f16 :
 def int_amdgcn_fdot2_bf16_bf16 :
   ClangBuiltin<"__builtin_amdgcn_fdot2_bf16_bf16">,
   DefaultAttrsIntrinsic<
-    [llvm_i16_ty],    // %r
+    [llvm_bfloat_ty], // %r
     [
-      llvm_v2i16_ty,  // %a
-      llvm_v2i16_ty,  // %b
-      llvm_i16_ty     // %c
+      llvm_v2bf16_ty, // %a
+      llvm_v2bf16_ty, // %b
+      llvm_bfloat_ty  // %c
     ],
     [IntrNoMem, IntrSpeculatable]
   >;
@@ -2835,8 +2835,8 @@ def int_amdgcn_fdot2_f32_bf16 :
   DefaultAttrsIntrinsic<
     [llvm_float_ty], // %r
     [
-      llvm_v2i16_ty, // %a
-      llvm_v2i16_ty, // %b
+      llvm_v2bf16_ty, // %a
+      llvm_v2bf16_ty, // %b
       llvm_float_ty, // %c
       llvm_i1_ty     // %clamp
     ],
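
With this change the intrinsic's IR signature becomes
`bfloat @llvm.amdgcn.fdot2.bf16.bf16(<2 x bfloat>, <2 x bfloat>, bfloat)`
instead of the `i16`-based one. A minimal sketch of emitting a call against
the new signature (hypothetical helper; the function name is mine):

```cpp
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Emit r = fdot2(a, b, c) with the new bf16 types. A and Bv must be
// <2 x bfloat>, C must be bfloat; the intrinsic is not overloaded, so no
// explicit type list is required.
static Value *emitFDot2BF16(Module &M, IRBuilder<> &B, Value *A, Value *Bv,
                            Value *C) {
  Function *F =
      Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_fdot2_bf16_bf16);
  return B.CreateCall(F, {A, Bv, C});
}
```
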
5 changes: 3 additions & 2 deletions llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1562,8 +1562,9 @@ bool IRTranslator::translateBitCast(const User &U,

 bool IRTranslator::translateCast(unsigned Opcode, const User &U,
                                  MachineIRBuilder &MIRBuilder) {
-  if (U.getType()->getScalarType()->isBFloatTy() ||
-      U.getOperand(0)->getType()->getScalarType()->isBFloatTy())
+  if (Opcode != TargetOpcode::G_BITCAST &&
+      (U.getType()->getScalarType()->isBFloatTy() ||
+       U.getOperand(0)->getType()->getScalarType()->isBFloatTy()))
     return false;
   Register Op = getOrCreateVReg(*U.getOperand(0));
   Register Res = getOrCreateVReg(U);
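
The new `Opcode != TargetOpcode::G_BITCAST` condition narrows the existing
bf16 fallback: a bitcast merely reinterprets bits, so it is safe to
translate even on targets without real bf16 operation support, and it is
needed now that bf16 values flow through GlobalISel here. The guard,
restated as a hypothetical standalone predicate:

```cpp
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/User.h"
using namespace llvm;

// Fall back to SelectionDAG only for casts that compute with bfloat values
// (fpext, fptrunc, ...). G_BITCAST is representation-only, so GlobalISel
// can translate it regardless of bf16 support.
static bool shouldFallBackForBFloat(unsigned Opcode, const User &U) {
  if (Opcode == TargetOpcode::G_BITCAST)
    return false;
  return U.getType()->getScalarType()->isBFloatTy() ||
         U.getOperand(0)->getType()->getScalarType()->isBFloatTy();
}
```
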
66 changes: 66 additions & 0 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -474,6 +474,8 @@ class AMDGPUOperand : public MCParsedAsmOperand {

   bool isSSrcF64() const { return isSCSrc_b64() || isLiteralImm(MVT::f64); }

+  bool isSSrc_bf16() const { return isSCSrcB16() || isLiteralImm(MVT::bf16); }
+
   bool isSSrc_f16() const { return isSCSrcB16() || isLiteralImm(MVT::f16); }

   bool isSSrcV2F16() const {
@@ -540,22 +542,40 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64);
   }

+  bool isVCSrcTBF16() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::bf16);
+  }
+
   bool isVCSrcTF16() const {
     return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::f16);
   }

+  bool isVCSrcTBF16_Lo128() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::bf16);
+  }
+
   bool isVCSrcTF16_Lo128() const {
     return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::f16);
   }

+  bool isVCSrcFake16BF16_Lo128() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::bf16);
+  }
+
   bool isVCSrcFake16F16_Lo128() const {
     return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::f16);
   }

+  bool isVCSrc_bf16() const {
+    return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::bf16);
+  }
+
   bool isVCSrc_f16() const {
     return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16);
   }

+  bool isVCSrc_v2bf16() const { return isVCSrc_bf16(); }
+
   bool isVCSrc_v2f16() const { return isVCSrc_f16(); }

   bool isVSrc_b32() const {
@@ -596,18 +616,34 @@ class AMDGPUOperand : public MCParsedAsmOperand {

   bool isVSrc_f64() const { return isVCSrcF64() || isLiteralImm(MVT::f64); }

+  bool isVSrcT_bf16() const { return isVCSrcTBF16() || isLiteralImm(MVT::bf16); }
+
   bool isVSrcT_f16() const { return isVCSrcTF16() || isLiteralImm(MVT::f16); }

+  bool isVSrcT_bf16_Lo128() const {
+    return isVCSrcTBF16_Lo128() || isLiteralImm(MVT::bf16);
+  }
+
   bool isVSrcT_f16_Lo128() const {
     return isVCSrcTF16_Lo128() || isLiteralImm(MVT::f16);
   }

+  bool isVSrcFake16_bf16_Lo128() const {
+    return isVCSrcFake16BF16_Lo128() || isLiteralImm(MVT::bf16);
+  }
+
   bool isVSrcFake16_f16_Lo128() const {
     return isVCSrcFake16F16_Lo128() || isLiteralImm(MVT::f16);
   }

+  bool isVSrc_bf16() const { return isVCSrc_bf16() || isLiteralImm(MVT::bf16); }
+
   bool isVSrc_f16() const { return isVCSrc_f16() || isLiteralImm(MVT::f16); }

+  bool isVSrc_v2bf16() const {
+    return isVSrc_bf16() || isLiteralImm(MVT::v2bf16);
+  }
+
   bool isVSrc_v2f16() const { return isVSrc_f16() || isLiteralImm(MVT::v2f16); }

   bool isVISrcB32() const {
@@ -634,6 +670,10 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isVISrcF16() || isVISrcB32();
   }

+  bool isVISrc_64_bf16() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::bf16);
+  }
+
   bool isVISrc_64_f16() const {
     return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f16);
   }
@@ -802,6 +842,10 @@ class AMDGPUOperand : public MCParsedAsmOperand {
     return isAISrc_128F16() || isAISrc_128_b32();
   }

+  bool isVISrc_128_bf16() const {
+    return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::bf16);
+  }
+
   bool isVISrc_128_f16() const {
     return isRegOrInlineNoMods(AMDGPU::VReg_128RegClassID, MVT::f16);
   }
@@ -1889,6 +1933,14 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
   case AMDGPU::OPERAND_KIMM16:
     return &APFloat::IEEEhalf();
+  case AMDGPU::OPERAND_REG_IMM_BF16:
+  case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
+  case AMDGPU::OPERAND_REG_INLINE_C_BF16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_BF16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
+    return &APFloat::BFloat();
   default:
     llvm_unreachable("unsupported fp type");
   }
@@ -2185,17 +2237,24 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
   case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_BF16:
   case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
   case AMDGPU::OPERAND_REG_IMM_V2INT16:
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
   case AMDGPU::OPERAND_REG_IMM_V2FP32:
@@ -2239,6 +2298,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
   case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
   case AMDGPU::OPERAND_REG_IMM_V2INT16:
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
   case AMDGPU::OPERAND_REG_IMM_V2FP32:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
@@ -2276,11 +2336,15 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
     return;

   case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_BF16:
   case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
     if (isSafeTruncation(Val, 16) &&
         AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
@@ -2295,8 +2359,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
     return;

   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
     assert(isSafeTruncation(Val, 16));
     assert(AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
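
The `getOpFltSemantics` additions above are what make the assembler convert
a floating-point literal written against a bf16 operand with bfloat
semantics rather than IEEE half. The difference is easy to see in isolation
(standalone sketch):

```cpp
#include "llvm/ADT/APFloat.h"
#include <cstdio>
using namespace llvm;

int main() {
  // The same source literal "1.0" encodes to different 16-bit patterns
  // depending on which semantics the parser picks.
  APFloat Half(APFloat::IEEEhalf(), "1.0");
  APFloat BF(APFloat::BFloat(), "1.0");
  std::printf("f16  1.0 = 0x%04llX\n", // prints 0x3C00
              (unsigned long long)Half.bitcastToAPInt().getZExtValue());
  std::printf("bf16 1.0 = 0x%04llX\n", // prints 0x3F80
              (unsigned long long)BF.bitcastToAPInt().getZExtValue());
}
```
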
10 changes: 10 additions & 0 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -521,8 +521,11 @@ void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm, uint8_t OpType,
     if (printImmediateFloat32(Imm, STI, O))
       return;
     break;
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
     if (isUInt<16>(Imm) &&
         printImmediateFloat16(static_cast<uint16_t>(Imm), STI, O))
@@ -792,17 +795,24 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
   case AMDGPU::OPERAND_REG_IMM_INT16:
     printImmediateInt16(Op.getImm(), STI, O);
     break;
+  case AMDGPU::OPERAND_REG_INLINE_C_BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+  case AMDGPU::OPERAND_REG_IMM_BF16:
   case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
     printImmediate16(Op.getImm(), STI, O);
     break;
   case AMDGPU::OPERAND_REG_IMM_V2INT16:
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
     printImmediateV216(Op.getImm(), OpTy, STI, O);
     break;
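
One thing worth flagging for reviewers: the bf16 cases above reuse the
f16-oriented helpers (`printImmediate16`, `printImmediateFloat16`), but f16
and bf16 inline-constant bit patterns differ, so an inline bf16 value may
not round-trip to a pretty-printed constant. A standalone illustration:

```cpp
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
using namespace llvm;

// 0x3F80 is bf16 1.0, but decoded with IEEE-half semantics (as an
// f16-based printer would decode it) it reads as 1.875, so it will not be
// matched against the printable inline constant 1.0.
bool matchesOneAsHalf() {
  APFloat V(APFloat::IEEEhalf(), APInt(16, 0x3F80));
  return V.isExactlyValue(1.0); // false
}
```
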
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -276,9 +276,13 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,
   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
     return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
+  case AMDGPU::OPERAND_REG_IMM_BF16:
   case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
+  case AMDGPU::OPERAND_REG_INLINE_C_BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
     // FIXME Is this correct? What do inline immediates do on SI for f16 src
     // which does not have f16 support?
@@ -288,8 +292,11 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
     return AMDGPU::getInlineEncodingV2I16(static_cast<uint32_t>(Imm))
         .value_or(255);
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
     return AMDGPU::getInlineEncodingV2F16(static_cast<uint32_t>(Imm))
         .value_or(255);
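
For context on the `value_or(255)` pattern: `getLitEncoding` returns the
8-bit source-operand encoding, where the small values select hardware
inline constants and 255 means "a 32-bit literal follows the instruction".
A toy model of that contract (not the real helper; the table is
abbreviated):

```cpp
#include <cstdint>
#include <optional>

// Abbreviated f16 inline-constant table: bit pattern -> operand encoding.
std::optional<uint8_t> inlineEncodingF16(uint16_t Bits) {
  switch (Bits) {
  case 0x0000: return 128; // 0.0
  case 0x3800: return 240; // 0.5
  case 0x3C00: return 242; // 1.0
  case 0x4000: return 244; // 2.0
  default: return std::nullopt; // remaining entries elided
  }
}

uint8_t encodeSrc(uint16_t Bits) {
  // bf16 1.0 (0x3F80) is not an f16 inline pattern, so it takes the
  // literal path: encodeSrc(0x3F80) == 255.
  return inlineEncodingF16(Bits).value_or(255);
}
```
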
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/SIDefines.h
@@ -196,9 +196,12 @@ enum OperandType : unsigned {
   OPERAND_REG_IMM_INT16,
   OPERAND_REG_IMM_FP32,
   OPERAND_REG_IMM_FP64,
+  OPERAND_REG_IMM_BF16,
   OPERAND_REG_IMM_FP16,
+  OPERAND_REG_IMM_BF16_DEFERRED,
   OPERAND_REG_IMM_FP16_DEFERRED,
   OPERAND_REG_IMM_FP32_DEFERRED,
+  OPERAND_REG_IMM_V2BF16,
   OPERAND_REG_IMM_V2FP16,
   OPERAND_REG_IMM_V2INT16,
   OPERAND_REG_IMM_V2INT32,
@@ -208,10 +211,12 @@ enum OperandType : unsigned {
   OPERAND_REG_INLINE_C_INT16,
   OPERAND_REG_INLINE_C_INT32,
   OPERAND_REG_INLINE_C_INT64,
+  OPERAND_REG_INLINE_C_BF16,
   OPERAND_REG_INLINE_C_FP16,
   OPERAND_REG_INLINE_C_FP32,
   OPERAND_REG_INLINE_C_FP64,
   OPERAND_REG_INLINE_C_V2INT16,
+  OPERAND_REG_INLINE_C_V2BF16,
   OPERAND_REG_INLINE_C_V2FP16,
   OPERAND_REG_INLINE_C_V2INT32,
   OPERAND_REG_INLINE_C_V2FP32,
@@ -226,10 +231,12 @@ enum OperandType : unsigned {
   /// Operands with an AccVGPR register or inline constant
   OPERAND_REG_INLINE_AC_INT16,
   OPERAND_REG_INLINE_AC_INT32,
+  OPERAND_REG_INLINE_AC_BF16,
   OPERAND_REG_INLINE_AC_FP16,
   OPERAND_REG_INLINE_AC_FP32,
   OPERAND_REG_INLINE_AC_FP64,
   OPERAND_REG_INLINE_AC_V2INT16,
+  OPERAND_REG_INLINE_AC_V2BF16,
   OPERAND_REG_INLINE_AC_V2FP16,
   OPERAND_REG_INLINE_AC_V2INT32,
   OPERAND_REG_INLINE_AC_V2FP32,
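
A note on why the insertion points inside the enum matter: these operand
types are consumed elsewhere through FIRST/LAST range sentinels defined
alongside the enum (sentinel names here are recalled from the surrounding
file, so treat them as assumptions). Inserting the BF16 enumerators inside
the existing blocks rather than appending them keeps those ranges intact:

```cpp
// Range checks of this shape appear throughout the backend; they keep
// working because OPERAND_REG_IMM_BF16 et al. land between the existing
// FIRST/LAST markers instead of after them.
bool isRegImmOperandType(unsigned OpTy) {
  return OpTy >= AMDGPU::OPERAND_REG_IMM_FIRST &&
         OpTy <= AMDGPU::OPERAND_REG_IMM_LAST;
}
```
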
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4181,13 +4181,20 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
     return AMDGPU::isInlinableLiteralV2I16(Imm);
+  case AMDGPU::OPERAND_REG_IMM_V2BF16:
   case AMDGPU::OPERAND_REG_IMM_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_V2BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
     return AMDGPU::isInlinableLiteralV2F16(Imm);
+  case AMDGPU::OPERAND_REG_IMM_BF16:
   case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_IMM_BF16_DEFERRED:
   case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
+  case AMDGPU::OPERAND_REG_INLINE_C_BF16:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_AC_BF16:
   case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
       // A few special case instructions have 16-bit operands on subtargets
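
Tying this back to the FIXME in the code emitter: the scalar bf16 cases
reuse `isInlinableLiteral16`, which matches f16 bit patterns, so (if I read
this right) a bf16 immediate such as 1.0 is simply classified as non-inline
and ends up encoded as a literal:

```cpp
#include "Utils/AMDGPUBaseInfo.h" // in-tree path under lib/Target/AMDGPU

bool bf16OneIsInline() {
  // 0x3F80 is bf16 1.0; isInlinableLiteral16 recognizes f16 bit patterns
  // (1.0 is 0x3C00 there) plus small integers, so this returns false and
  // the operand would fall back to a 32-bit literal.
  return llvm::AMDGPU::isInlinableLiteral16(static_cast<int16_t>(0x3F80),
                                            /*HasInv2Pi=*/true);
}
```
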