Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -306096,6 +306096,250 @@
reorderGRInstForDTVB: false
tailLoopOptA: true
tailLoopOptB: true
- 1LDSBuffer: 0
ActivationAlt: false
ActivationFuncCall: true
ActivationFused: true
AssertAIGreaterThanEqual: -1
AssertAILessThanEqual: -1
AssertFree0ElementMultiple: 1
AssertFree1ElementMultiple: 1
AssertSummationElementMultiple: 1
AssignedDerivedParameters: true
AssignedProblemIndependentDerivedParameters: true
BaseName: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1iMO6yjTG4yuZPDkcMXrcObwuUOXV_ipLalV5INSQqYE=
BufferLoad: true
BufferStore: true
CUCount: null
CUOccupancy: -1
ClusterLocalRead: 1
CodeObjectVersion: '4'
ConvertAfterDS: false
CustomKernelName: ''
DebugStreamK: 0
DepthU: 64
DirectToLds: true
DirectToLdsA: true
DirectToLdsB: true
DirectToVgprA: false
DirectToVgprB: false
DirectToVgprSparseMetadata: false
EdgeType: ShiftPtr
EnableF32XdlMathOp: false
EnableMatrixInstruction: true
ExpandPointerSwap: 0
ExpertSchedulingMode: 0
ForceDisableShadowInit: false
ForceUnrollSubIter: false
GlobalReadPerMfma: 1
GlobalReadVectorWidthA: 8
GlobalReadVectorWidthB: 8
GlobalSplitU: 0
GlobalSplitUAlgorithm: MultipleBuffer
GlobalSplitUCoalesced: false
GlobalSplitUWorkGroupMappingRoundRobin: false
GlobalWriteVectorWidth: 2
GroupLoadStore: false
GuaranteeNoPartialA: true
GuaranteeNoPartialB: true
GuaranteeNoPartialMetadata: true
ISA: [9, 5, 0]
InnerUnroll: 1
InterleaveAlpha: 0
InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true,
SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true}
Kernel: true
KernelLanguage: Assembly
KernelNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1
LDSTrInst: false
LSCA: 64
LSCB: 64
LSPA: 32
LSPB: 32
LVCA: 8
LVCB: 8
LVPA: 4
LVPB: 4
LdsBlockSizePerPadA: 1024
LdsBlockSizePerPadB: 1024
LdsBlockSizePerPadMetadata: 0
LdsBytesNoAmax: 123776
LdsInitCVgprs: false
LdsNumBytes: 123776
LdsNumElementsAlignedA: 24960
LdsNumElementsAlignedB: 33280
LdsNumElementsAlignedMetadata: 0
LdsOffsetA: 0
LdsOffsetA_Blk: 65536
LdsOffsetB: 24960
LdsOffsetB_Blk: 90496
LdsOffsetBias: 0
LdsOffsetBiasGSU: 0
LdsOffsetBiasNonGSU: 0
LdsOffsetMetadata: 24960
LdsOffsetMetadata_Blk: 90496
LdsPadA: 8
LdsPadB: 8
LdsPadMetadata: 0
LocalReadVectorWidth: 8
LocalSplitU: 1
LocalSplitUReuseLDS: 1
LocalWritePerMfma: -1
LocalWriteUseSgprA: true
LocalWriteUseSgprB: true
LoopIters: 2
LoopUnroll: 64
MFMA_BF16_1K: false
MIArchVgpr: false
MIBlock: [16, 16, 32, 1, 1, 1]
MIInputPerThread: 8
MIInputPerThreadA: 8
MIInputPerThreadB: 8
MIInputPerThreadMetadata: 8
MIOutputVectorWidth: 4
MIRegPerOut: 1
MIWaveGroup: [2, 2]
MIWaveTile: [6, 8]
MIWaveTileA: 6
MIWaveTileB: 8
MIWaveTileMetadata: 0
MacroTile0: 192
MacroTile1: 256
MacroTileA: 192
MacroTileB: 256
MagicDivAlg: 2
MathClocksUnrolledLoop: 0
MatrixInstB: 1
MatrixInstBM: 1
MatrixInstBN: 1
MatrixInstK: 32
MatrixInstM: 16
MatrixInstN: 16
MatrixInstruction: [16, 16, 32, 1]
MaxLDS: 163840
MaxOccupancy: 40
MbskPrefetchMethod: 0
MfmaInitCVgprs: false
NoLdsWriteCode: true
NoReject: false
NoTailLoop: false
NonDTLTailLoopA: true
NonDTLTailLoopB: true
NonTemporal: -1
NonTemporalA: 0
NonTemporalB: 0
NonTemporalC: 0
NonTemporalD: 4
NonTemporalE: 0
NonTemporalMetadata: 0
NonTemporalWS: 0
NumElementsPerBatchStore: 0
NumElementsPerThread: 192
NumGlobalWriteVectorsPerThread: 96
NumLoadsA: 6
NumLoadsB: 8
NumLoadsCoalescedA: 1
NumLoadsCoalescedB: 1
NumLoadsPerpendicularA: 6
NumLoadsPerpendicularB: 8
NumThreads: 256
NumTotalPackedLoadsA: 6
NumTotalPackedLoadsB: 8
NumWaveSplitK: 1
OptNoLoadLoop: 1
PackedC0IdxChars: [I]
PackedC0IndicesX: [0]
PackedC1IdxChars: [J]
PackedC1IndicesX: [1]
PrefetchGlobalRead: 2
PrefetchLocalRead: 1
PreloadKernArgs: true
SFCWGM:
- [1, 1]
- [1, 1]
ScheduleGlobalRead: 1
ScheduleIterAlg: 3
ScheduleLocalWrite: 1
SolutionIndex: 1308
SolutionNameMin: Cijk_Alik_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI0_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW2_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA2_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1
SourceSwap: 1
SpaceFillingAlgo: []
StaggerU: 0
StaggerUMapping: 0
StaggerUStride: 128
StorePriorityOpt: false
StoreRemapVectorWidth: 0
StoreSwapAddr: false
StoreSyncOpt: 0
StoreVectorWidth: 2
StreamK: 3
StreamKAtomic: 0
StreamKFixupTreeReduction: 0
StreamKXCCMapping: 0
SubGroup0: 8
SubGroup1: 32
SubGroupA: 8
SubGroupB: 32
SuppressNoLoadLoop: false
SwapGlobalReadOrder: false
ThreadTile: [1, 1]
ThreadTile0: 24
ThreadTile1: 8
ThreadTileA: 24
ThreadTileB: 8
TransposeLDS: 1
TransposeLDSMetadata: true
ULSGRODoubleG2L: 0
UnrollLoopSwapGlobalReadOrder: 0
UnrollMajorLDSA: true
UnrollMajorLDSB: true
UnrollMajorLDSMetadata: true
Use64bShadowLimit: 1
UseCustomMainLoopSchedule: true
UseDirect32XEmulation: false
UseDot2F32XEmulation: false
UseDotInstruction: false
UseF32XEmulation: false
UseGeneralizedNLCOneA: true
UseGeneralizedNLCOneB: true
UseGeneralizedNLCOneMetadata: false
UseInstOffsetForGRO: 0
UsePLRPack: false
UseSgprForGRO: 0
Valid: true
VectorStore: -1
VectorWidthA: 2
VectorWidthB: 8
WaveSeparateGlobalReadA: 0
WaveSeparateGlobalReadB: 0
WaveSeparateGlobalReadMetadata: 0
WaveSplitK: false
WavefrontSize: 64
WorkGroup: [32, 8, 1]
WorkGroupMapping: 16
WorkGroupMappingXCC: 2
WorkGroupMappingXCCGroup: -1
WorkGroupReduction: false
WorkspaceCheck: [4, 0, 0]
_DepthU: 64
_DepthUA: 64
_DepthUB: 64
_DepthUMetadata: 64
_GlobalAccumulation: PartialsBuffer
_UseSgprForGRO: 0
_VectorStore: 1
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 0
enableGLTrA: false
enableGLTrB: false
enableLDSTrA: false
enableLDSTrB: false
numSubTiles: 1
reorderGRInstForDTVA: false
reorderGRInstForDTVB: false
tailLoopOptA: false
tailLoopOptB: false
- [2, 3, 0, 1]
- - - [16, 368640, 1, 224]
- [0, 0.0]
Expand Down Expand Up @@ -308735,6 +308979,8 @@
- [1305, 0.0]
- - [1920, 2048, 1, 15964]
- [1306, 0.0]
- - [3072, 4096, 1, 8192]
- [1308, 0.0]
- null
- null
- DeviceEfficiency
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,47 @@ def _get_schedule_192x256x64_16bit(kernel, useLDSTr, TLDS):
SBarrier(comment=""),
SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="Wait for LRB0 to complete"),]
nglshift = nllshift = 14 # vmcnt shift for ngl and nll
elif isTN(kernel) and not useLDSTr and TLDS == 1:
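# syncTable is a flat list of alternating (index, code) entries: even slots hold the indices (sliced into optSchedule['SYNC']), odd slots hold the matching sync instructions (sliced into syncCode).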
syncTable = [-1, SWaitCnt(dscnt=7, vlcnt=-1, vscnt=-1, comment="for LRB1-0"),
6, SWaitCnt(dscnt=6+5, vlcnt=-1, vscnt=-1, comment="for LRB1-1"),
8, SBarrier(comment="for GRA start"),
11, SWaitCnt(dscnt=5+8, vlcnt=-1, vscnt=-1, comment="for LRB1-2"),
17, SWaitCnt(dscnt=4+11, vlcnt=-1, vscnt=-1, comment="for LRB1-3"),
23, SWaitCnt(dscnt=15, vlcnt=-1, vscnt=-1, comment="for LRB1-4:6"),
41, SWaitCnt(dscnt=14, vlcnt=-1, vscnt=-1, comment="for LRB1-7"),
46, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRA1"),
48, SBarrier(comment="for LRA1 start"),
78, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="for LRB1"),
78, SBarrier(comment="for LRB1 start"),]
optSchedule = {
'SYNC' : [syncTable[::2]],
'GRIncA': [[0,1,2,3,4,5,6,7,8]],
'GRIncB': [[9,10,11,12,13,14,15,16,17]],

'LRA0' : [[0, 2, 3, 4, 5, 6]],
'LRB0' : [[7, 9, 11, 13, 15, 17, 19, 21],
[8, 10, 12, 14, 16, 18, 20, 22]],
'GRA' : [[8,8, 10,10, 12,12, 14,14, 26,26, 31,31],
[9,9, 11,11, 13,13, 15,15, 27,27, 32,32]],

'GRB' : [[46,46, 50,50, 54,54, 58,58, 62,62, 66,66, 70,70, 76,76],
[47,47, 51,51, 55,55, 59,59, 63,63, 67,67, 71,71, 77,77]],
'LRA1' : [[48, 52, 56, 58, 60, 64],
[49, 53, 57, 59, 61, 65]],
# 0 1 2 3 4 5 6 7
'LRB1' : [[78, 80, 82, 84, 86, 90, 92, 94],
[79, 81, 83, 85, 87, 91, 93, 95]],

'LRSA' : [[22]],
'LRSB' : [[23]],

'LWSA' : [[20]],
'LWSB' : [[78]],
'LCC' : [[95, 95]],
}
syncCode = syncTable[1::2]
nglshift = nllshift = 14 # vmcnt shift for ngl and nll
else:
return False, None

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,43 @@ BenchmarkProblems:
- Range: [[613], [612], [1], [1, 1, 64]]
- Exact: [8192, 4096, 1, 2048]
- BiasTypeArgs: ['b']

- # BenchmarkProblemSizeGroup - Standard - All problem
InitialSolutionParameters:
BenchmarkCommonParameters:
- KernelLanguage: ["Assembly"]
ForkParameters:
- MatrixInstruction:
- [16, 16,32, 1, 1, 6, 8, 2,2 ]
- PrefetchGlobalRead: [2]
- PrefetchLocalRead: [1]
- DepthU: [64]
- ScheduleIterAlg: [3]
- ExpandPointerSwap: [0]
- TransposeLDS: [1] #0,1
- LocalReadVectorWidth: [8]
- GlobalReadVectorWidthA: [8]
- GlobalReadVectorWidthB: [8]
- DirectToLds: [1]
- StreamK: [3]
- LdsPadA: [8] #[-1]
- LdsPadB: [8] #[-1]
- StaggerU: [0]
- WorkGroupMapping: [16]
- WorkGroupMappingXCC: [2]
- 1LDSBuffer: [0]
- NonTemporalD: [4]
- SourceSwap: [1]
- UseSgprForGRO: [0]
- UseCustomMainLoopSchedule: [0, 1]
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [[192], [256], [1], [64, 64, 256]]
- Range: [[192], [256], [1], [1, 1, 64]]
- Range: [[192], [256], [1], [32, 64, 256]]
- Range: [[6144], [8192], [1], [64, 64, 256]]
- Exact: [3072, 4096, 1, 8192]
- BiasTypeArgs: ['b']
########################################
# HHS TN - standard
########################################
Expand Down