Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
0adc241
Refactors CustomSchedule.py for scale out of schedules (#2574)
talumbau Nov 11, 2025
7641aa7
add 192x256x64TN
jfactory07 Nov 12, 2025
8f47b79
typo
jfactory07 Nov 12, 2025
1b39b51
add test
jfactory07 Nov 10, 2025
485c703
more test
jfactory07 Nov 12, 2025
c1b8911
Merge branch 'hipblaslt_common_cms_dev' into users/jzhou/cms-192x256x…
jfactory07 Nov 14, 2025
5a943a5
typo
jfactory07 Nov 14, 2025
461fb4f
fix random fail
jfactory07 Nov 17, 2025
634b8ae
Merge branch 'hipblaslt_common_cms_dev' into users/jzhou/cms-192x256x…
jfactory07 Nov 20, 2025
ba7d50b
refine
jfactory07 Nov 20, 2025
366831f
Merge branch 'hipblaslt_common_cms_dev' into users/jzhou/cms-192x256x…
jfactory07 Nov 20, 2025
3cebdbf
Merge branch 'hipblaslt_common_cms_dev' into users/jzhou/cms-192x256x…
jfactory07 Nov 20, 2025
564ae51
Merge branch 'hipblaslt_common_cms_dev' into users/jzhou/cms-192x256x…
jfactory07 Nov 21, 2025
4d3c807
remove typo
jfactory07 Nov 21, 2025
5a79662
typo
jfactory07 Nov 21, 2025
9f33c6f
Merge branch 'hipblaslt_common_cms_dev' into users/jzhou/cms-192x256x…
jfactory07 Nov 24, 2025
f0865c6
Merge branch 'hipblaslt_common_cms_dev' into users/jzhou/cms-192x256x…
jfactory07 Nov 24, 2025
3abb511
Merge branch 'hipblaslt_common_cms_dev' into users/jzhou/cms-192x256x…
jfactory07 Nov 25, 2025
6c7223e
typo
jfactory07 Nov 25, 2025
f9f9b0c
typo
jfactory07 Nov 25, 2025
1c4f0c0
typo
jfactory07 Nov 25, 2025
84c1192
rename
jfactory07 Nov 25, 2025
eed850a
refine
jfactory07 Nov 26, 2025
5a125e0
Merge branch 'hipblaslt_common_cms_dev' into users/jzhou/cms-192x256x…
jfactory07 Nov 26, 2025
efda36c
typo
jfactory07 Nov 28, 2025
0dca66c
Merge branch 'hipblaslt_common_cms_dev' into users/jzhou/cms-192x256x…
jfactory07 Nov 28, 2025
c4b1d4a
disable LDSTrInst in test
jfactory07 Nov 28, 2025
77c3ebd
refine
jfactory07 Dec 1, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -308290,7 +308290,251 @@
reorderGRInstForDTVA: false
reorderGRInstForDTVB: false
tailLoopOptA: false
tailLoopOptB: false
tailLoopOptB: false
- 1LDSBuffer: 0
ActivationAlt: false
ActivationFuncCall: false
ActivationFused: true
AssertAIGreaterThanEqual: -1
AssertAILessThanEqual: -1
AssertFree0ElementMultiple: 1
AssertFree1ElementMultiple: 1
AssertSummationElementMultiple: 1
AssignedDerivedParameters: true
AssignedProblemIndependentDerivedParameters: true
BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_SAV_UserArgs_MT192x25GrlcXKC7t-UqBC8MTU-ba4yBkbzpcFIFSmhpwzdMe9w=
BufferLoad: true
BufferStore: true
CUCount: null
CUOccupancy: -1
ClusterLocalRead: 1
CodeObjectVersion: '4'
ConvertAfterDS: false
CustomKernelName: ''
DebugStreamK: 0
DepthU: 64
DirectToLds: true
DirectToLdsA: true
DirectToLdsB: true
DirectToVgprA: false
DirectToVgprB: false
DirectToVgprSparseMetadata: false
EdgeType: ShiftPtr
EnableF32XdlMathOp: false
EnableMatrixInstruction: true
ExpandPointerSwap: 0
ExpertSchedulingMode: 0
ForceDisableShadowInit: false
ForceUnrollSubIter: false
GlobalReadPerMfma: 1
GlobalReadVectorWidthA: 8
GlobalReadVectorWidthB: 8
GlobalSplitU: 0
GlobalSplitUAlgorithm: MultipleBuffer
GlobalSplitUCoalesced: false
GlobalSplitUWorkGroupMappingRoundRobin: false
GlobalWriteVectorWidth: 2
GroupLoadStore: false
GuaranteeNoPartialA: true
GuaranteeNoPartialB: true
GuaranteeNoPartialMetadata: true
ISA: [9, 5, 0]
InnerUnroll: 1
InterleaveAlpha: 0
InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true,
SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true}
Kernel: true
KernelLanguage: Assembly
KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1
LDSTrInst: false
LSCA: 64
LSCB: 64
LSPA: 32
LSPB: 32
LVCA: 8
LVCB: 8
LVPA: 4
LVPB: 4
LdsBlockSizePerPadA: 1024
LdsBlockSizePerPadB: 1024
LdsBlockSizePerPadMetadata: 0
LdsBytesNoAmax: 123776
LdsInitCVgprs: false
LdsNumBytes: 123776
LdsNumElementsAlignedA: 24960
LdsNumElementsAlignedB: 33280
LdsNumElementsAlignedMetadata: 0
LdsOffsetA: 0
LdsOffsetA_Blk: 65536
LdsOffsetB: 24960
LdsOffsetB_Blk: 90496
LdsOffsetBias: 0
LdsOffsetBiasGSU: 0
LdsOffsetBiasNonGSU: 0
LdsOffsetMetadata: 24960
LdsOffsetMetadata_Blk: 90496
LdsPadA: 8
LdsPadB: 8
LdsPadMetadata: 0
LocalReadVectorWidth: 8
LocalSplitU: 1
LocalSplitUReuseLDS: 1
LocalWritePerMfma: -1
LocalWriteUseSgprA: true
LocalWriteUseSgprB: true
LoopIters: 2
LoopUnroll: 64
MFMA_BF16_1K: false
MIArchVgpr: false
MIBlock: [16, 16, 32, 1, 1, 1]
MIInputPerThread: 8
MIInputPerThreadA: 8
MIInputPerThreadB: 8
MIInputPerThreadMetadata: 8
MIOutputVectorWidth: 4
MIRegPerOut: 1
MIWaveGroup: [2, 2]
MIWaveTile: [6, 8]
MIWaveTileA: 6
MIWaveTileB: 8
MIWaveTileMetadata: 0
MacroTile0: 192
MacroTile1: 256
MacroTileA: 192
MacroTileB: 256
MagicDivAlg: 2
MathClocksUnrolledLoop: 0
MatrixInstB: 1
MatrixInstBM: 1
MatrixInstBN: 1
MatrixInstK: 32
MatrixInstM: 16
MatrixInstN: 16
MatrixInstruction: [16, 16, 32, 1]
MaxLDS: 163840
MaxOccupancy: 40
MbskPrefetchMethod: 0
MfmaInitCVgprs: true
NoLdsWriteCode: true
NoReject: false
NoTailLoop: false
NonDTLTailLoopA: true
NonDTLTailLoopB: true
NonTemporal: -1
NonTemporalA: 0
NonTemporalB: 0
NonTemporalC: 0
NonTemporalD: 4
NonTemporalE: 0
NonTemporalMetadata: 0
NonTemporalWS: 0
NumElementsPerBatchStore: 0
NumElementsPerThread: 192
NumGlobalWriteVectorsPerThread: 96
NumLoadsA: 6
NumLoadsB: 8
NumLoadsCoalescedA: 1
NumLoadsCoalescedB: 1
NumLoadsPerpendicularA: 6
NumLoadsPerpendicularB: 8
NumThreads: 256
NumTotalPackedLoadsA: 6
NumTotalPackedLoadsB: 8
NumWaveSplitK: 1
OptNoLoadLoop: 1
PackedC0IdxChars: [I]
PackedC0IndicesX: [0]
PackedC1IdxChars: [J]
PackedC1IndicesX: [1]
PrefetchGlobalRead: 2
PrefetchLocalRead: 1
PreloadKernArgs: true
SFCWGM:
- [1, 1]
- [1, 1]
ScheduleGlobalRead: 1
ScheduleIterAlg: 3
ScheduleLocalWrite: 1
SolutionIndex: 1317
SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1
SourceSwap: 1
SpaceFillingAlgo: []
StaggerU: 0
StaggerUMapping: 0
StaggerUStride: 128
StorePriorityOpt: false
StoreRemapVectorWidth: 0
StoreSwapAddr: false
StoreSyncOpt: 0
StoreVectorWidth: 2
StreamK: 3
StreamKAtomic: 0
StreamKFixupTreeReduction: 0
StreamKXCCMapping: 0
SubGroup0: 8
SubGroup1: 32
SubGroupA: 8
SubGroupB: 32
SuppressNoLoadLoop: false
SwapGlobalReadOrder: false
ThreadTile: [1, 1]
ThreadTile0: 24
ThreadTile1: 8
ThreadTileA: 24
ThreadTileB: 8
TransposeLDS: 1
TransposeLDSMetadata: true
ULSGRODoubleG2L: 0
UnrollLoopSwapGlobalReadOrder: 0
UnrollMajorLDSA: true
UnrollMajorLDSB: true
UnrollMajorLDSMetadata: true
Use64bShadowLimit: 1
UseCustomMainLoopSchedule: true
UseDirect32XEmulation: false
UseDot2F32XEmulation: false
UseDotInstruction: false
UseF32XEmulation: false
UseGeneralizedNLCOneA: true
UseGeneralizedNLCOneB: true
UseGeneralizedNLCOneMetadata: false
UseInstOffsetForGRO: 0
UsePLRPack: false
UseSgprForGRO: 0
Valid: true
VectorStore: -1
VectorWidthA: 2
VectorWidthB: 8
WaveSeparateGlobalReadA: 0
WaveSeparateGlobalReadB: 0
WaveSeparateGlobalReadMetadata: 0
WaveSplitK: false
WavefrontSize: 64
WorkGroup: [32, 8, 1]
WorkGroupMapping: 16
WorkGroupMappingXCC: 2
WorkGroupMappingXCCGroup: -1
WorkGroupReduction: false
WorkspaceCheck: [4, 0, 0]
_DepthU: 64
_DepthUA: 64
_DepthUB: 64
_DepthUMetadata: 64
_GlobalAccumulation: PartialsBuffer
_UseSgprForGRO: 0
_VectorStore: 1
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 0
enableGLTrA: false
enableGLTrB: false
enableLDSTrA: false
enableLDSTrB: false
numSubTiles: 1
reorderGRInstForDTVA: false
reorderGRInstForDTVB: false
tailLoopOptA: false
tailLoopOptB: false
- [2, 3, 0, 1]
- - - [16, 368640, 1, 224]
- [0, 0.0]
Expand Down Expand Up @@ -311467,7 +311711,9 @@
- - [3840, 4096, 1, 8192]
- [1315, 0.0]
- - [3328, 4096, 1, 8192]
- [1316, 0.0]
- [1316, 0.0]
- - [3072, 4096, 1, 8192]
- [1317, 0.0]
- null
- null
- DeviceEfficiency
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,45 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS):
SBarrier(comment=""),
SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),]
nglshift = nllshift = 14 # vmcnt shift for ngl and nll
elif isTN(kernel) and not useLDSTr and TLDS == 1:
#index and code pair
syncTable = [-1, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB1-0"),
5, SWaitCnt(dscnt=5, vlcnt=-1, vscnt=-1, comment="for LRB1"),
14, SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="for LRA0 complete"),
14, SBarrier(comment="for GRA start"),
46, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="for LRB0"),
46, SBarrier(comment="for GRB start"),
50, SWaitCnt(dscnt=-1, vlcnt=14+1, vscnt=-1, comment="for LRA1"),
50, SBarrier(comment="for LRA1 start"),
65, SWaitCnt(dscnt=-1, vlcnt=6+5, vscnt=-1, comment="for LRB0"),
65, SBarrier(comment="for LRB1 start"),]
optSchedule = {
'SYNC' : [syncTable[::2]],
'GRIncA': [[6,6,7,7,8,8,9,9,9]],
'GRIncB': [[9,10,11,12,13,14,15,16,17]],

'LRA0' : [[0, 1, 2, 3, 4, 5],
[-1, 0, 1, 2, 3, 4]],
'LRB0' : [[7, 9, 11, 13, 15, 17, 19, 21],
[8, 10, 12, 13, 16, 18, 20, 22]],
'GRA' : [[14,14, 16,16, 18,18, 20,20, 25,25, 31,31],
[15,15, 17,17, 19,19, 21,21, 26,26, 32,32]],

'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 76,76],
Comment thread
jfactory07 marked this conversation as resolved.
[47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 77,77]],
'LRA1' : [[50, 52, 56, 58, 60, 62],
[51, 53, 57, 59, 61, 63]],
'LRB1' : [[65, 67, 69, 71, 73, 75, 77, 79],
[66, 68, 70, 72, 74, 76, 78, 80]],

'LRSA' : [[47]],
'LRSB' : [[47]],
'LWSA' : [[47]],
'LWSB' : [[80]],
'LCC' : [[95, 95]],
}
syncCode = syncTable[1::2]
nglshift = nllshift = 14 # vmcnt shift for ngl and nll
elif isNT(kernel) and not useLDSTr and TLDS == 0:
kernel["UsePLRPack"] = True

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1055,6 +1055,44 @@ BenchmarkProblems:
- Range: [[208], [256], [1], [32, 64, 256]]
- Exact: [3328, 4096, 1, 8192]
- BiasTypeArgs: ['b']
- # BenchmarkProblemSizeGroup - Standard - All problem
InitialSolutionParameters:
BenchmarkCommonParameters:
- KernelLanguage: ["Assembly"]
ForkParameters:
- MatrixInstruction:
- [16, 16,32, 1, 1, 6, 8, 2,2 ]
- PrefetchGlobalRead: [2]
- PrefetchLocalRead: [1]
- DepthU: [64]
- ScheduleIterAlg: [3]
- ExpandPointerSwap: [0]
- TransposeLDS: [1] #0,1
- LDSTrInst: [0]
- LocalReadVectorWidth: [8]
- GlobalReadVectorWidthA: [8]
- GlobalReadVectorWidthB: [8]
- DirectToLds: [1]
- StreamK: [3]
- LdsPadA: [8] #[-1]
- LdsPadB: [8] #[-1]
- StaggerU: [0]
- WorkGroupMapping: [16]
- WorkGroupMappingXCC: [2]
- 1LDSBuffer: [0]
- NonTemporalD: [4]
- SourceSwap: [1]
- UseSgprForGRO: [0]
- UseCustomMainLoopSchedule: [0, 1]
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [[192], [256], [1], [64, 64, 256]]
- Range: [[192], [256], [1], [1, 1, 64]]
- Range: [[192], [256], [1], [32, 64, 256]]
- Range: [[6144], [8192], [1], [64, 64, 256]]
- Exact: [3072, 4096, 1, 8192]
- BiasTypeArgs: ['b']

########################################
# HHS TN - standard
Expand Down