Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions clang/include/clang/Basic/BuiltinsAMDGPU.def
Original file line number Diff line number Diff line change
Expand Up @@ -673,6 +673,7 @@ TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_exp2_bf16, "yy", "nc", "bf16-trans-insts")

TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
Expand Down
1 change: 1 addition & 0 deletions clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -439,6 +439,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_log_bf16:
return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
case AMDGPU::BI__builtin_amdgcn_exp2f:
case AMDGPU::BI__builtin_amdgcn_exp2_bf16:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
Intrinsic::amdgcn_exp2);
case AMDGPU::BI__builtin_amdgcn_log_clampf:
Expand Down
19 changes: 19 additions & 0 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,25 @@ void test_log_bf16(global __bf16* out, __bf16 a)
*out = __builtin_amdgcn_log_bf16(a);
}

// Verifies that __builtin_amdgcn_exp2_bf16 applied to a __bf16 argument is
// emitted as a call to @llvm.amdgcn.exp2.bf16, and that the result is stored
// through the global pointer `out`.
// CHECK-LABEL: @test_exp2_bf16(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.exp2.bf16(bfloat [[TMP0]])
// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
// CHECK-NEXT: ret void
//
void test_exp2_bf16(global __bf16* out, __bf16 a)
{
*out = __builtin_amdgcn_exp2_bf16(a);
}

// CHECK-LABEL: @test_cvt_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,7 @@ defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
defm V_EXP_BF16 : VOP1Inst_t16 <"v_exp_bf16", VOP_BF16_BF16, AMDGPUexpf16>;
}
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
Expand Down Expand Up @@ -1145,6 +1146,7 @@ defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>;
defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>;
defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>;
defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>;
defm V_EXP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07d>;

//===----------------------------------------------------------------------===//
// GFX10.
Expand Down
23 changes: 23 additions & 0 deletions llvm/test/CodeGen/AMDGPU/bf16-math.ll
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,27 @@ define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
ret void
}

define amdgpu_ps void @llvm_exp2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
; GCN-LABEL: llvm_exp2_bf16_v:
; GCN: ; %bb.0:
; GCN-NEXT: v_exp_bf16_e32 v2, v2
; GCN-NEXT: global_store_b16 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's OK to directly select the generic intrinsic, why add the amdgcn one?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They were added at different times in the past, and we do have a builtin for that.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't follow

Copy link
Contributor Author

@shiltian shiltian Jul 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The target intrinsic int_amdgcn_exp2 was added a long time ago (along with the pattern AMDGPUexpf16). We didn't explicitly define a bf16 version of it. However, we did add a Clang builtin for that.

On the other hand, the generic intrinsic support was only added about a month ago.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because it gives direct access to the instruction without going through potential legalization.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's no legalization if the instruction works correctly. There's no reason to have the Clang builtin, though the codegen for the target intrinsic should still work with the new type if it is legal.

store bfloat %exp, ptr addrspace(1) %out, align 2
ret void
}

; Source operand is passed `inreg`: the generic llvm.exp2.bf16 call is expected
; to select v_exp_bf16_e32 reading s0 directly, followed by a 16-bit global
; store of the result.
define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
; GCN-LABEL: llvm_exp2_bf16_s:
; GCN: ; %bb.0:
; GCN-NEXT: v_exp_bf16_e32 v2, s0
; GCN-NEXT: global_store_b16 v[0:1], v2, off
; GCN-NEXT: s_endpgm
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
store bfloat %exp, ptr addrspace(1) %out, align 2
ret void
}

declare bfloat @llvm.log2.bf16(bfloat)
declare bfloat @llvm.exp2.bf16(bfloat)
33 changes: 33 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.bf16.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s
; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s

; FIXME: GlobalISel does not work with bf16

declare bfloat @llvm.amdgcn.exp2.bf16(bfloat) #0

; Target intrinsic llvm.amdgcn.exp2.bf16 applied to a kernel argument: expects
; v_exp_bf16_e32 with a vector-register destination and a scalar-register
; source.
; GCN-LABEL: {{^}}exp_bf16:
; GCN: v_exp_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}}
define amdgpu_kernel void @exp_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
%exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat %src) #0
store bfloat %exp, ptr addrspace(1) %out, align 2
ret void
}

; Target intrinsic applied to the constant 4.0: the operand is expected to be
; printed as 4.0 directly in the v_exp_bf16_e32 instruction.
; GCN-LABEL: {{^}}exp_bf16_constant_4
; GCN: v_exp_bf16_e32 v0, 4.0
define amdgpu_kernel void @exp_bf16_constant_4(ptr addrspace(1) %out) #1 {
%exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat 4.0) #0
store bfloat %exp, ptr addrspace(1) %out, align 2
ret void
}

; Target intrinsic applied to the constant 100.0: the operand is expected to be
; printed as the 16-bit literal 0x42c8 (the bf16 bit pattern of 100.0).
; GCN-LABEL: {{^}}exp_bf16_constant_100
; GCN: v_exp_bf16_e32 {{v[0-9]+}}, 0x42c8
define amdgpu_kernel void @exp_bf16_constant_100(ptr addrspace(1) %out) #1 {
%exp = call bfloat @llvm.amdgcn.exp2.bf16(bfloat 100.0) #0
store bfloat %exp, ptr addrspace(1) %out, align 2
ret void
}

attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
Loading
Loading