[AMDGPU] Global and Buffer loads to LDS should not increase lgkmcnt#179305
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Alexander Weinrauch (AlexAUT) Changes
Note that the change for buffer ops is not necesssary, i.e. the lit test passes even before this PR, because it seems like Full diff: https://github.com/llvm/llvm-project/pull/179305.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index ca8099736c187..181374fb7b0b8 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -509,7 +509,7 @@ class MUBUF_Load_Pseudo <string opName,
let AsmMatchConverter = "cvtMubuf";
let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", "");
- let LGKM_CNT = isLds;
+ let LGKM_CNT = 0;
let has_vdata = !not(!or(isLds, isLdsOpc));
let mayLoad = 1;
let mayStore = isLds;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 36d54309181ae..3ad15ae28e51e 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -398,7 +398,7 @@ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsy
!if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)),
(ins flat_offset:$offset, CPol_0:$cpol)),
!if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
- let LGKM_CNT = !not(IsAsync);
+ let LGKM_CNT = 0;
let VM_CNT = !not(IsAsync);
let ASYNC_CNT = IsAsync;
let is_flat_global = 1;
diff --git a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
index 7e9f21b94bea0..a526b1ec16502 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-crash-valu-hazard.ll
@@ -54,7 +54,6 @@ define amdgpu_ps void @global_load_lds_dword_saddr(ptr addrspace(1) inreg nocapt
; GFX90A-NEXT: s_mov_b32 m0, s4
; GFX90A-NEXT: s_nop 0
; GFX90A-NEXT: global_load_dword v0, s[2:3] offset:32 slc lds
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_store_dwordx2 v0, v[2:3], s[0:1]
; GFX90A-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
index 21372c06d3223..0e64d0430668e 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
+++ b/llvm/test/CodeGen/AMDGPU/lds-dma-waitcnt.mir
@@ -67,6 +67,45 @@ body: |
...
+# Test that GLOBAL_LOAD_LDS does not increment lgkmcnt (LGKM_CNT = 0).
+# GCN-LABEL: name: ds_read_global_load_lds_use_ds_data
+# GCN: DS_READ_B32_gfx9
+# GCN-NEXT: GLOBAL_LOAD_LDS_DWORD
+# GCN-NEXT: S_WAITCNT 49279
+# lgkmcnt(0)
+# GCN-NEXT: V_ADD_U32_e32
+---
+name: ds_read_global_load_lds_use_ds_data
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ GLOBAL_LOAD_LDS_DWORD $vgpr2_vgpr3, 4, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison` + 4), (store (s32) into `ptr addrspace(3) poison` + 4)
+ $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Test that BUFFER_LOAD_DWORD_LDS does not increment lgkmcnt.
+# DS_READ increments lgkmcnt. When using the DS_READ result, we wait for lgkmcnt(0).
+# GCN-LABEL: name: ds_read_buffer_load_lds_use_ds_data
+# GCN: DS_READ_B32_gfx9
+# GCN-NEXT: BUFFER_LOAD_DWORD_LDS_IDXEN
+# GCN-NEXT: S_WAITCNT 49279
+# lgkmcnt(0)
+# GCN-NEXT: V_ADD_U32_e32
+---
+name: ds_read_buffer_load_lds_use_ds_data
+body: |
+ bb.0:
+ $m0 = S_MOV_B32 0
+ $vgpr0 = DS_READ_B32_gfx9 $vgpr1, 0, 0, implicit $m0, implicit $exec :: (load (s32) from `ptr addrspace(3) poison`)
+ BUFFER_LOAD_DWORD_LDS_IDXEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, implicit $exec, implicit $m0 :: (load (s32) from `ptr addrspace(1) poison`), (store (s32) into `ptr addrspace(3) poison`)
+ $vgpr4 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+ S_ENDPGM 0
+
+...
+
# GCN-LABEL: name: scratch_load_lds_dword_ds_read
# GCN: SCRATCH_LOAD_LDS_DWORD
# GCN-NEXT: S_WAITCNT 3952
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 1c986a02e8bd6..fba7720b37bf6 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -327,7 +327,7 @@ define void @global_load_lds_dword_saddr_and_vaddr(ptr addrspace(1) nocapture in
; GFX942-NEXT: s_mov_b32 m0, s2
; GFX942-NEXT: s_nop 0
; GFX942-NEXT: global_load_lds_dword v1, s[0:1] offset:48 sc1
-; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_waitcnt vmcnt(0)
; GFX942-NEXT: s_setpc_b64 s[30:31]
main_body:
%voffset.64 = zext i32 %voffset to i64
|
|
Can you rephrase the title? They certainly do for DS loads to LDS. This is a more niche case |
Technically DS loads would be loads from LDS. I'm not sure what to call these. "Load to LDS" is OK but a bit subtle. "LDS DMA load" terminology is no longer used in the hardware docs. "Load direct-to-LDS" is likely to be confused with the GFX11 |
lgkmcntlgkmcnt
|
Maybe explicitly stating that this is for Global and Buffer loads to LDS is better? |
| let AsmMatchConverter = "cvtMubuf"; | ||
|
|
||
| let Constraints = !if(HasTiedDest, "$vdata = $vdata_in", ""); | ||
| let LGKM_CNT = isLds; |
There was a problem hiding this comment.
Just for the record, it looks like this came from https://reviews.llvm.org/D124472.
| !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64_AlignTarget:$vaddr)), | ||
| (ins flat_offset:$offset, CPol_0:$cpol)), | ||
| !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> { | ||
| let LGKM_CNT = !not(IsAsync); |
There was a problem hiding this comment.
And this came from #151030 and previously from https://reviews.llvm.org/D121414.
jayfoad
left a comment
There was a problem hiding this comment.
LGTM. I can't find definitive documentation on this but intuitively it makes sense that these load-to-LDS opcodes do not use lgkmcnt.
Pulls in llvm/llvm-project#179305 to fix a synchronization issue with `AsyncCopy` on `GFX9`
…llvm#179305) `global_load_lds` and `buffer_load to lds` do only increment `vmcnt` and not touch `lgkmcnt`. This causes invalid `waitcnts` for some Triton kernels, similar to the added lit tests. Note that the change for buffer ops is not necesssary, i.e. the lit test passes even before this PR, because it seems like `SIInsertWaitcnts` does not use `LGKM_CNT` for buffer ops. But this change might prevent a bug in the future.
This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083
This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083 (cherry picked from commit 552306f)
LLVM bump cherry pick for rel/3.7, this improve stability across ROCm archs. This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083 (cherry picked from commit 552306f) <!--- The core Triton is a small number of people, and we receive many PRs (thank you!). To help us review your code more quickly, **if you are a new contributor (less than 3 PRs merged) we ask that you complete the following tasks and include the filled-out checklist in your PR description.** Complete the following tasks before sending your PR, and replace `[ ]` with `[x]` to indicate you have done them. --> # New contributor declaration - [ ] I am not making a trivial change, such as fixing a typo in a comment. - [ ] I have written a PR description following these [rules](https://cbea.ms/git-commit/#why-not-how). - [ ] I have run `pre-commit run --from-ref origin/main --to-ref HEAD`. - Select one of the following. - [ ] I have added tests. - `/test` for `lit` tests - `/unittest` for C++ tests - `/python/test` for end-to-end tests - [ ] This PR does not need a test because `FILL THIS IN`. - Select one of the following. - [ ] I have not added any `lit` tests. - [ ] The `lit` tests I have added follow these [best practices](https://mlir.llvm.org/getting_started/TestingGuide/#filecheck-best-practices), including the "tests should be minimal" section. (Usually running Python code and using the instructions it generates is not minimal.) Co-authored-by: Alexander Weinrauch <alexander.weinrauch@amd.com>
This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083
This pulls in: - llvm/llvm-project#179305 - llvm/llvm-project#180220 - llvm/llvm-project#180268 `NoInfsFPMath` got removed in llvm/llvm-project#180083
global_load_ldsandbuffer_load to ldsdo only incrementvmcntand not touchlgkmcnt. This causes invalidwaitcntsfor some Triton kernels, similar to the added lit tests.Note that the change for buffer ops is not necesssary, i.e. the lit test passes even before this PR, because it seems like
SIInsertWaitcntsdoes not useLGKM_CNTfor buffer ops. But this change might prevent a bug in the future.