Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@ enum { MAX_LANES = 64 };

using namespace llvm;

// TODO -- delete this flag once we have more robust mechanisms to allocate the
// optimal register class (RC) for Opc and Dest of MFMA. In particular, there
// are high register pressure (RP) cases where it is better to produce the VGPR
// form (e.g. if there are VGPR users of the MFMA result).
cl::opt<bool> MFMAVGPRForm(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing `static` on this `cl::opt` definition.

"amdgpu-mfma-vgpr-form", cl::Hidden,
cl::desc("Whether to force use VGPR for Opc and Dest of MFMA. If "
"unspecified, default to compiler heuristics"),
cl::init(false));

const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
const SITargetLowering *TLI = STI->getTargetLowering();
return static_cast<const GCNTargetMachine &>(TLI->getTargetMachine());
Expand Down Expand Up @@ -69,8 +79,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
}

MayNeedAGPRs = ST.hasMAIInsts();
if (ST.hasGFX90AInsts() &&
MayNeedAGPRs = ST.hasMAIInsts() && !MFMAVGPRForm;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This breaks if used with gfx908, so it's impossible to flip the default

if (!MFMAVGPRForm && ST.hasGFX90AInsts() &&
ST.getMaxNumVGPRs(F) <= AMDGPU::VGPR_32RegClass.getNumRegs() &&
!mayUseAGPRs(F))
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
Expand Down
76 changes: 76 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.form.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=0 < %s | FileCheck -enable-var-scope --check-prefixes=HEURRC %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx950 --amdgpu-mfma-vgpr-form=1 < %s | FileCheck -enable-var-scope --check-prefixes=VGPRRC %s

declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x float>, i32 immarg, i32 immarg, i32 immarg)

; Baseline case: the function carries no attribute group.
; The HEURRC run (--amdgpu-mfma-vgpr-form=0) expects the MFMA to use AGPRs for
; its accumulator and result, with v_accvgpr_write/v_accvgpr_read copies around
; it. The VGPRRC run (--amdgpu-mfma-vgpr-form=1) expects a single MFMA that
; operates on VGPRs directly.
define <4 x float> @default(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) {
; HEURRC-LABEL: default:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
; HEURRC-NEXT: s_setpc_b64 s[30:31]
;
; VGPRRC-LABEL: default:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
ret <4 x float> %result
}

; Same MFMA call as @default, but the function uses attribute group #0, which
; this file defines as "amdgpu-agpr-alloc"="32,256".
; The HEURRC run (--amdgpu-mfma-vgpr-form=0) expects the AGPR form with
; v_accvgpr_write/v_accvgpr_read copies. The VGPRRC run
; (--amdgpu-mfma-vgpr-form=1) still expects the VGPR form, i.e. the checks
; show the flag taking effect even with this attribute present.
define <4 x float> @request_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #0 {
; HEURRC-LABEL: request_agpr:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HEURRC-NEXT: v_accvgpr_write_b32 a0, v8
; HEURRC-NEXT: v_accvgpr_write_b32 a1, v9
; HEURRC-NEXT: v_accvgpr_write_b32 a2, v10
; HEURRC-NEXT: v_accvgpr_write_b32 a3, v11
; HEURRC-NEXT: s_nop 1
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 a[0:3], v[0:3], v[4:7], a[0:3]
; HEURRC-NEXT: s_nop 7
; HEURRC-NEXT: v_accvgpr_read_b32 v0, a0
; HEURRC-NEXT: v_accvgpr_read_b32 v1, a1
; HEURRC-NEXT: v_accvgpr_read_b32 v2, a2
; HEURRC-NEXT: v_accvgpr_read_b32 v3, a3
; HEURRC-NEXT: s_setpc_b64 s[30:31]
;
; VGPRRC-LABEL: request_agpr:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
ret <4 x float> %result
}

; Same MFMA call as @default, but the function uses attribute group #1, which
; this file defines as "amdgpu-agpr-alloc"="0,0".
; Both the HEURRC run (--amdgpu-mfma-vgpr-form=0) and the VGPRRC run
; (--amdgpu-mfma-vgpr-form=1) expect the identical VGPR form of the MFMA, with
; no v_accvgpr_write/v_accvgpr_read copies.
define <4 x float> @request_no_agpr(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2) #1 {
; HEURRC-LABEL: request_no_agpr:
; HEURRC: ; %bb.0:
; HEURRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; HEURRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; HEURRC-NEXT: s_setpc_b64 s[30:31]
;
; VGPRRC-LABEL: request_no_agpr:
; VGPRRC: ; %bb.0:
; VGPRRC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VGPRRC-NEXT: v_mfma_f32_16x16x32_f16 v[0:3], v[0:3], v[4:7], v[8:11]
; VGPRRC-NEXT: s_setpc_b64 s[30:31]
%result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half> %arg0, <8 x half> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0)
ret <4 x float> %result
}

attributes #0 = { "amdgpu-agpr-alloc"="32,256" }
attributes #1 = { "amdgpu-agpr-alloc"="0,0" }
Loading
Loading