Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
7a67b2b
Preserve MUBUF scope modifiers through StinkyTofu lowering
jaopaulolc Apr 26, 2026
089ba46
Format fix
jaopaulolc Apr 27, 2026
9e0b014
Merge branch 'develop' into users/nhenders/stinky-SCOPE
jaopaulolc Apr 27, 2026
e79e255
Use enun rather than std::string for MUBUF modifiers
jaopaulolc Apr 27, 2026
a2246be
Merge branch 'develop' into users/nhenders/stinky-SCOPE
jaopaulolc Apr 27, 2026
e54cc2a
Merge branch 'develop' into users/nhenders/stinky-SCOPE
jaopaulolc Apr 28, 2026
d574f98
Add offset flags if not set or no mubuf modifiers
jaopaulolc Apr 28, 2026
4d20e35
Merge branch 'develop' into users/nhenders/stinky-SCOPE
jaopaulolc Apr 28, 2026
58c54c2
Fix MUBUF null soffset emission
ThanHenderson Apr 29, 2026
676fad7
Fix format
ThanHenderson Apr 29, 2026
aa9b63c
Remove redundant MUBUF scope flag
ThanHenderson Apr 30, 2026
53bf38b
Fix format
ThanHenderson Apr 30, 2026
aae24e6
KernelWriter: fix missing s_wait_tensorcnt before PGR barrier in TDM …
ThanHenderson Apr 24, 2026
7facbcc
StreamK: fix gfx1250 flag and partial ordering
ThanHenderson Apr 26, 2026
064fe03
Remove CrossLaneWait for 1250
ThanHenderson Apr 26, 2026
a8eaba8
Set Scope flags via MUBUFModifiers instead of inline asm
jaopaulolc Apr 26, 2026
a8a265d
StreamK: use 1LDSBuffer for MX quick tests
jaopaulolc Apr 26, 2026
2883bbf
Re-enable SK tests on GFX1250
jaopaulolc Apr 27, 2026
5f12b69
Fix gfx1250 MX SubIter LDS waits
jaopaulolc Apr 27, 2026
e3d54d9
Refactor StreamK gfx1250 fences into capability-driven Component
ThanHenderson Apr 27, 2026
46e8c88
Fix comment
ThanHenderson Apr 27, 2026
ee63bc7
Merge branch 'develop' into users/nhenders/fix-streamk-1250-fr-fr
ThanHenderson Apr 30, 2026
c586dfb
Merge branch 'develop' into users/nhenders/fix-streamk-1250-fr-fr
ThanHenderson May 1, 2026
c535fcd
Merge branch 'develop' into users/nhenders/fix-streamk-1250-fr-fr
ThanHenderson May 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
186 changes: 176 additions & 10 deletions projects/hipblaslt/tensilelite/Tensile/Components/StreamK.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@
from rocisa.code import Module, Label
from rocisa.container import vgpr, sgpr, SMEMModifiers, MUBUFModifiers, replaceHolder, EXEC,\
VOP3PModifiers, ContinuousRegister
from rocisa.instruction import SAddCU32, SAddU32, SAndB32, SBarrier, \
from rocisa.instruction import MacroInstruction, SAddCU32, SAddU32, SAndB32, SBarrier, \
SBranch, SCBranchSCC0, SCBranchSCC1, SCMovB32, SCSelectB32, SCmpEQU32, SCmpEQU64, \
SCmpGtU32, SCmpLeU32, SCmpLtU32, SLShiftLeftB32, SLShiftLeftB64, SLShiftRightB32, SLoadB32, \
SMaxI32, SMinU32, SMovB32, SMovB64, SMulI32, SNop, SOrB32, SSleep, SStoreB32, SSubU32, \
SWaitCnt, VAddF32, VAddF64, VAddPKF16, VAddU32, VLShiftRightB32, VMovB32, \
VReadfirstlaneB32, VCvtBF16toFP32, BufferStoreB32
VReadfirstlaneB32, VCvtBF16toFP32, BufferLoadB32, BufferStoreB32
from rocisa.functions import scalarStaticDivideAndRemainder, sMagicDiv2, \
vectorStaticMultiply, BranchIfNotZero, scalarUInt24DivideAndRemainder, scalarUInt32DivideAndRemainder

Expand Down Expand Up @@ -107,6 +107,143 @@ def __call__(self, writer, kernel):
return module


class StreamKMemoryOrdering(Component):
Comment thread
ThanHenderson marked this conversation as resolved.
"""
Memory-ordering fences and flag accessors for the StreamK partial-tile
handshake.

StreamK uses a producer/consumer protocol: one workgroup writes a partial
tile to a workspace and signals completion via a flag, and other
workgroups poll the flag and read the partials. The required cross-CU
memory ordering depends on the target ISA:

- Most arches: `s_waitcnt vscnt(0)` before the flag store and an SMEM
flag load with `glc/dlc/scope:SCOPE_DEV` are sufficient because
ordering between L1/L2 and the device-scope coherence point is
implicit.

- gfx1250: the L2 has independent partitions and SMEM is not coherent
with the VMEM flag store, so an explicit `global_wb scope:SCOPE_DEV`
is required on the release side and a `global_inv scope:SCOPE_DEV`
on the acquire side. Additionally, XNACK-replay can reorder a
volatile/atomic VMEM op past in-flight VMEM, so `s_wait_xcnt 0` must
precede such ops. The flag itself must be read via VMEM (not SMEM)
to observe the producer's release-side fence.

Selection is driven by the `HasInvWbDevFences` arch capability. The
XNACK-replay drain in `preVolatileVmem` is gated separately on
`RequiresXCntForVolatileVMEM` and lives on the abstract base so a
future arch needing only one of the two can be supported by adding a
single capability flag.
"""
def __call__(self):
assert(0)

def preVolatileVmem(self, writer, comment="") -> Module:
"""Drain in-flight VMEM (XNACK-replay) before a volatile/atomic VMEM op.

Required on arches with `RequiresXCntForVolatileVMEM`. No-op
elsewhere.
"""
module = Module("StreamK pre-volatile VMEM drain")
if writer.states.archCaps["RequiresXCntForVolatileVMEM"]:
module.add(MacroInstruction(name="s_wait_xcnt 0", args=[], comment=comment))
return module

@abc.abstractmethod
def releaseFence(self, writer) -> Module:
"""Memory fence ordering prior partial-tile stores before the flag store."""
pass

@abc.abstractmethod
def acquireFence(self, writer) -> Module:
"""Memory fence after observing the flag and before reading partials."""
pass

@abc.abstractmethod
def readFlag(self, writer, dst, soffset) -> Module:
"""Read the StreamK completion flag into SGPR `dst` for compare."""
pass

@abc.abstractmethod
def flagBufferMubuf(self) -> MUBUFModifiers:
"""MUBUF modifiers for buffer load/store of the flag word."""
pass


class StreamKMemoryOrderingDefault(StreamKMemoryOrdering):
"""No-op cross-CU fences; SMEM flag with glc/dlc/SCOPE_DEV.

Used on every arch that does not require explicit cross-L2 fences.
"""
archCaps = {"HasInvWbDevFences": False}

def releaseFence(self, writer) -> Module:
module = Module("StreamK release fence (default)")
module.add(SWaitCnt(vscnt=0, comment="wait for data store"))
return module

def acquireFence(self, writer) -> Module:
return Module("StreamK acquire fence (default, no-op)")

def readFlag(self, writer, dst, soffset) -> Module:
module = Module("StreamK read flag (SMEM)")
module.add(SLoadB32(dst=sgpr(dst), base=sgpr("AddressFlags", 2),
soffset=soffset,
smem=SMEMModifiers(glc=True, dlc=True,
scope=CacheScope.SCOPE_DEV),
comment="get flag"))
module.add(SWaitCnt(kmcnt=0, comment="wait for flag load"))
return module

def flagBufferMubuf(self) -> MUBUFModifiers:
return MUBUFModifiers(offen=True, glc=True, dlc=True,
scope=CacheScope.SCOPE_DEV)


class StreamKMemoryOrderingDevScopeFences(StreamKMemoryOrdering):
"""Explicit cross-L2 release/acquire fences via global_wb/global_inv
scope:SCOPE_DEV plus a VMEM-coherent flag read.

Selected on arches whose L2 is partitioned across CUs/XCDs and whose
SMEM is not coherent with the VMEM flag write (e.g. gfx1250).
"""
archCaps = {"HasInvWbDevFences": True}

def releaseFence(self, writer) -> Module:
module = Module("StreamK release fence (dev-scope)")
module.add(SWaitCnt(vlcnt=0,
comment="release: drain in-flight loads before global_wb"))
module.add(SWaitCnt(vscnt=0, comment="wait for data store"))
module.add(MacroInstruction(name="global_wb scope:SCOPE_DEV", args=[],
comment="release: writeback partials to L2-coherent point"))
module.add(SWaitCnt(vlcnt=0, vscnt=0,
comment="release: wait for global_wb"))
return module

def acquireFence(self, writer) -> Module:
module = Module("StreamK acquire fence (dev-scope)")
module.add(MacroInstruction(name="global_inv scope:SCOPE_DEV", args=[],
comment="acquire: invalidate partials after flag"))
module.add(SWaitCnt(vlcnt=0, comment="acquire: wait for global_inv"))
return module

def readFlag(self, writer, dst, soffset) -> Module:
streamk = Component.StreamK.find(writer)
module = Module("StreamK read flag (VMEM)")
flagVgpr = writer.vgprPool.checkOut(1, "flagAcq")
module.add(streamk.getFlagValue(writer, dst=vgpr(flagVgpr),
soffset=soffset, comment="acquire: get flag (VMEM)"))
module.add(SWaitCnt(vlcnt=0, comment="acquire: wait VMEM flag load"))
module.add(VReadfirstlaneB32(dst=sgpr(dst), src=vgpr(flagVgpr),
comment="move VMEM flag to SGPR for compare"))
writer.vgprPool.checkIn(flagVgpr)
return module

def flagBufferMubuf(self) -> MUBUFModifiers:
return MUBUFModifiers(offen=True, scope=CacheScope.SCOPE_DEV)


class StreamK(Component):
"""
StreamK code.
Expand Down Expand Up @@ -472,6 +609,7 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele
if kernel["StreamKAtomic"]:
return module

memOrder = Component.StreamKMemoryOrdering.find(writer)
skConstsInVgprs = writer.isStreamKConstantsToVgprEnabled(kernel)
skStoreLabel = Label(label=writer.labels.getNameInc("SK_Store"), comment="")

Expand Down Expand Up @@ -593,11 +731,11 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele
module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sFlagIdx), shiftHex=log2(4), comment="flag offset based on wg index"))

module.add(skFixupWaitForFlag) # loop to wait for flag
module.add(SLoadB32(dst=sgpr(tmpSgpr+1), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag"))
module.add(SWaitCnt(kmcnt=0, comment="wait for flag load"))
module.add(memOrder.readFlag(writer, dst=tmpSgpr+1, soffset=sgpr(tmpSgpr)))
if kernel["DebugStreamK"] & 2 == 0: # Don't wait for partials if not being written
module.add(SCmpEQU32(src0=sgpr(tmpSgpr+1), src1=1, comment="check if ready"))
module.add(SCBranchSCC0(labelName=skFixupWaitForFlag.getLabelName(), comment="if flag not set, wait and check again"))
module.add(memOrder.acquireFence(writer))

module.add(SBarrier(comment="wait for all workgroups before resetting flag"))
skipFlagReset = Label(label=writer.labels.getNameInc("SK_SkipFlagReset"), comment="")
Expand Down Expand Up @@ -681,11 +819,11 @@ def storeBranchesCommon(self, writer, kernel, skPartialsLabel, vectorWidths, ele

# Check flag
module.add(SLShiftLeftB32(dst=sgpr(tmpSgpr), src=sgpr(sCtaIdx), shiftHex=log2(4), comment="flag offset based on CTA index"))
module.add(SLoadB32(dst=sgpr(tmpSgpr+2), base=sgpr("AddressFlags", 2), soffset=sgpr(tmpSgpr), smem=SMEMModifiers(glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), comment="get flag"))
module.add(SWaitCnt(kmcnt=0, comment="wait for flag load"))
module.add(memOrder.readFlag(writer, dst=tmpSgpr+2, soffset=sgpr(tmpSgpr)))
if kernel["DebugStreamK"] & 2 == 0:
module.add(SCmpEQU32(src0=sgpr(tmpSgpr+2), src1=1, comment="check if ready"))
module.add(SCBranchSCC0(labelName=skFixupLabel.getLabelName(), comment="if flag not set, wait and check again"))
module.add(memOrder.acquireFence(writer))

# TODO Barrier here to sync all threads in workgroup, but maybe better to have separate flag for each wavefront (to be tested)
module.add(SBarrier(comment="wait for all workgroups before resetting flag"))
Expand Down Expand Up @@ -804,6 +942,7 @@ def computeWorkspaceSrd(self, writer, kernel, sCtaIdx, tmpSgpr = None):

def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha, beta, edge, tmpVgpr, cvtVgprStruct, endLabel):
module = Module("StreamK Common partialsWriteProcedure")
memOrder = Component.StreamKMemoryOrdering.find(writer)

# PreLoopVmcntCaseStr = ""
# # not generate Case 2 if StoreCInUnroll with StoreVectorWidth==1 (Case 2 will be same as Case 3)
Expand Down Expand Up @@ -1009,7 +1148,7 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha,
# kStr += PreLoopVmcntCaseStr

# Set flag
module.add(SWaitCnt(vscnt=0, comment="wait for data store"))
module.add(memOrder.releaseFence(writer))
module.add(SBarrier(comment="store all data before setting flag"))
sIdx = writer.acquireStreamKConstSgpr(kernel, "StreamKIdx")
if writer.isStreamKConstantsToVgprEnabled(kernel):
Expand Down Expand Up @@ -1040,22 +1179,49 @@ def partialsWriteProcedure(self, writer, kernel, vectorWidths, elements, alpha,

def setFlagValue(self, writer, src, soffset, comment=""):
module = Module("Buffer Store Flag Value")
memOrder = Component.StreamKMemoryOrdering.find(writer)
tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False)
tmpVgprOff = writer.vgprPool.checkOut(1, "vaddr_off")
module.add(VMovB32(dst=vgpr(tmpVgprOff), src=0, comment="zero vaddr offset"))
module.add(SMovB64(dst=sgpr(tmpSgprBuffer, 2), src=sgpr("AddressFlags", 2)))
module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB"))
module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96"))
module.add(self.shiftSrd(writer, tmpSgprBuffer))
module.add(BufferStoreB32(src=src, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset, \
mubuf=MUBUFModifiers(offen=True, glc=True, dlc=True, scope=CacheScope.SCOPE_DEV), \
comment=comment))
module.add(memOrder.preVolatileVmem(writer, comment="drain xnacks before volatile VMEM store"))
module.add(BufferStoreB32(src=src, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset,
mubuf=memOrder.flagBufferMubuf(), comment=comment))
module.add(SWaitCnt(vscnt=0, comment="wait for data store"))
writer.vgprPool.checkIn(tmpVgprOff)
writer.sgprPool.checkIn(tmpSgprBuffer)

return module

def getFlagValue(self, writer, dst, soffset, comment=""):
"""Buffer-load primitive for the StreamK flag.

Used by `StreamKMemoryOrderingDevScopeFences.readFlag` to perform a
VMEM-coherent flag load. Default arches read the flag via SMEM
directly in `StreamKMemoryOrderingDefault.readFlag` and never call
this helper.
"""
module = Module("Buffer Load Flag Value")
memOrder = Component.StreamKMemoryOrdering.find(writer)
tmpSgprBuffer = writer.sgprPool.checkOutAligned(4, 4, preventOverflow=False)
tmpVgprOff = writer.vgprPool.checkOut(1, "vaddr_off")
module.add(VMovB32(dst=vgpr(tmpVgprOff), src=0, comment="zero vaddr offset"))
module.add(SMovB64(dst=sgpr(tmpSgprBuffer, 2), src=sgpr("AddressFlags", 2)))
module.add(SMovB32(dst=sgpr(tmpSgprBuffer+2), src="BufferOOB"))
module.add(SMovB32(dst=sgpr(tmpSgprBuffer+3), src="Srd127_96"))
module.add(self.shiftSrd(writer, tmpSgprBuffer))
module.add(memOrder.preVolatileVmem(writer, comment="drain xnacks before volatile VMEM load"))
module.add(BufferLoadB32(dst=dst, vaddr=vgpr(tmpVgprOff), saddr=sgpr(tmpSgprBuffer, 4), soffset=soffset,
mubuf=memOrder.flagBufferMubuf(),
comment=comment))
writer.vgprPool.checkIn(tmpVgprOff)
writer.sgprPool.checkIn(tmpSgprBuffer)

return module

def partialsWriteBatch(self, writer, kernel, ss, batchIdx, applyAlpha, beta, edge, gwvw, atomicW, \
batchElements, addrD, addrC, \
tmpVgpr, cvtVgprStruct, batchElementSgprs, tmpSgpr, codeAccVgprRead):
Expand Down
18 changes: 15 additions & 3 deletions projects/hipblaslt/tensilelite/Tensile/KernelWriter.py
Original file line number Diff line number Diff line change
Expand Up @@ -1809,7 +1809,12 @@ def calculateRangeAndUpdateCounter(itemCounter, writeCounters, length):
####
# scheduled wait localReads
####
if self.states.numItersPLR == 0 and kernel["EnableMatrixInstruction"] and self.do["OptimizeNumItersPLR0"] and not scheduleTF32Emu:
subIterMxNeedsDependentLrWait = (
kernel["ForceUnrollSubIter"] and self.states.version == (12,5,0)
and (kernel["ProblemType"]["MXBlockA"] or kernel["ProblemType"]["MXBlockB"])
)
if ((self.states.numItersPLR == 0 and self.do["OptimizeNumItersPLR0"]) or subIterMxNeedsDependentLrWait) \
and kernel["EnableMatrixInstruction"] and not scheduleTF32Emu:
dscnt = -1
mfmas = getMFMAs(macIterCode)
## To support do["MAC"] is False
Expand Down Expand Up @@ -3942,12 +3947,19 @@ def _loopBody( self, kernel, tensorParametersA, tensorParametersB, pack, packPre
vlcntVal = kernel["PrefetchGlobalRead"] - 1 if kernel["PrefetchGlobalRead"] >= 2 else 0
waitLWCode.add(self._wait(kernel, tensorParametersA, tensorParametersB, vlcntVal, -1, -1, \
"wait for previous set of global reads"))
elif kernel["enableTDMA"] and kernel["enableTDMB"]:
# TDM case: tensor_load_to_lds instructions (issued in prior iter) write to LDS via the
# tensor counter. A s_wait_tensorcnt 0 is required before the barrier to guarantee all
# TDM stores to LDS have landed before other waves read from that LDS buffer.
waitLWCode.add(self._wait(kernel, tensorParametersA, tensorParametersB, 0, -1, -1, \
"wait for TDM global reads"))
# (no local write code. Global read wait for DirectToLds is already done)
if not kernel["NoLdsWriteCode"]:
waitLWCode.add(self._wait(kernel, tensorParametersA, tensorParametersB, -1, 0, -1, "3wait for local write"))
skipForceWaitcnt0 = False
if kernel["DirectToVgprA"] or kernel["DirectToVgprB"] or kernel["DirectToLdsA"] or kernel["DirectToLdsB"]:
# DTVA/B or DTLA/B case, skip generating force waitcnt0
if kernel["DirectToVgprA"] or kernel["DirectToVgprB"] or kernel["DirectToLdsA"] or kernel["DirectToLdsB"] or \
(kernel["enableTDMA"] and kernel["enableTDMB"]):
# DTVA/B, DTLA/B, or TDM case: global read wait is handled above, skip force waitcnt0
skipForceWaitcnt0 = True
syncCode.add(self._syncThreads(kernel, "PGR, and wait until LW done to sync LDS%u"%(self.states.ldsBarrierTokenIdx), skipForceWaitcnt0=skipForceWaitcnt0, memoryToken=[self.states.ldsBarrierTokenIdx]))

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
TestParameters:
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch

GlobalParameters:
NumElementsToValidate: -1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
TestParameters:
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch

GlobalParameters:
NumElementsToValidate: -1
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
TestParameters:
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch
GlobalParameters:
NumElementsToValidate: -1
BoundsCheck: False
Expand Down Expand Up @@ -209,7 +209,7 @@ BenchmarkProblems:
- VectorWidthA: [-1]
- VectorWidthB: [-1]
- LocalReadVectorWidth: [-1]
- 1LDSBuffer: [0]
- 1LDSBuffer: [1, 0]
- DirectToVgprSparseMetadata: [0]
- StoreVectorWidth: [-1]
- StreamK: [3]
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
TestParameters:
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201, skip-gfx1250] # not supported by arch
marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx940, skip-gfx941, skip-gfx942, skip-gfx950, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # not supported by arch
GlobalParameters:
NumElementsToValidate: -1
BoundsCheck: False
Expand Down
Loading
Loading