Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -130549,6 +130549,250 @@
reorderGRInstForDTVB: false
tailLoopOptA: false
tailLoopOptB: false
- 1LDSBuffer: 0
ActivationAlt: false
ActivationFuncCall: false
ActivationFused: true
AssertAIGreaterThanEqual: -1
AssertAILessThanEqual: -1
AssertFree0ElementMultiple: 1
AssertFree1ElementMultiple: 1
AssertSummationElementMultiple: 1
AssignedDerivedParameters: true
AssignedProblemIndependentDerivedParameters: true
BaseName: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x19efg19fI0a0QQO2NPQwOSTE-N35cXpB90pMQj9VJB-ns=
BufferLoad: true
BufferStore: true
CUCount: null
CUOccupancy: -1
ClusterLocalRead: 1
CodeObjectVersion: '4'
ConvertAfterDS: false
CustomKernelName: ''
DebugStreamK: 0
DepthU: 64
DirectToLds: true
DirectToLdsA: true
DirectToLdsB: true
DirectToVgprA: false
DirectToVgprB: false
DirectToVgprSparseMetadata: false
EdgeType: ShiftPtr
EnableF32XdlMathOp: false
EnableMatrixInstruction: true
ExpandPointerSwap: 0
ExpertSchedulingMode: 0
ForceDisableShadowInit: false
ForceUnrollSubIter: false
GlobalReadPerMfma: 1
GlobalReadVectorWidthA: 8
GlobalReadVectorWidthB: 8
GlobalSplitU: 0
GlobalSplitUAlgorithm: MultipleBuffer
GlobalSplitUCoalesced: false
GlobalSplitUWorkGroupMappingRoundRobin: false
GlobalWriteVectorWidth: 1
GroupLoadStore: false
GuaranteeNoPartialA: false
GuaranteeNoPartialB: true
GuaranteeNoPartialMetadata: true
ISA: [9, 5, 0]
InnerUnroll: 1
InterleaveAlpha: 0
InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true,
SupportUserGSU: false, UseSFC: false, UseUniversalArgs: true}
Kernel: true
KernelLanguage: Assembly
KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1
LDSTrInst: 1
LSCA: 256
LSCB: 64
LSPA: 8
LSPB: 32
LVCA: 32
LVCB: 8
LVPA: 1
LVPB: 4
LdsBlockSizePerPadA: 1024
LdsBlockSizePerPadB: 1024
LdsBlockSizePerPadMetadata: 0
LdsBytesNoAmax: 123776
LdsInitCVgprs: false
LdsNumBytes: 123776
LdsNumElementsAlignedA: 33280
LdsNumElementsAlignedB: 24960
LdsNumElementsAlignedMetadata: 0
LdsOffsetA: 0
LdsOffsetA_Blk: 65536
LdsOffsetB: 33280
LdsOffsetB_Blk: 98816
LdsOffsetBias: 0
LdsOffsetBiasGSU: 0
LdsOffsetBiasNonGSU: 0
LdsOffsetMetadata: 33280
LdsOffsetMetadata_Blk: 98816
LdsPadA: 8
LdsPadB: 8
LdsPadMetadata: 0
LocalReadVectorWidth: 8
LocalSplitU: 1
LocalSplitUReuseLDS: 1
LocalWritePerMfma: -1
LocalWriteUseSgprA: true
LocalWriteUseSgprB: true
LoopIters: 2
LoopUnroll: 64
MFMA_BF16_1K: false
MIArchVgpr: false
MIBlock: [16, 16, 32, 1, 1, 1]
MIInputPerThread: 8
MIInputPerThreadA: 8
MIInputPerThreadB: 8
MIInputPerThreadMetadata: 8
MIOutputVectorWidth: 4
MIRegPerOut: 1
MIWaveGroup: [2, 2]
MIWaveTile: [8, 6]
MIWaveTileA: 8
MIWaveTileB: 6
MIWaveTileMetadata: 0
MacroTile0: 256
MacroTile1: 192
MacroTileA: 256
MacroTileB: 192
MagicDivAlg: 2
MathClocksUnrolledLoop: 0
MatrixInstB: 1
MatrixInstBM: 1
MatrixInstBN: 1
MatrixInstK: 32
MatrixInstM: 16
MatrixInstN: 16
MatrixInstruction: [16, 16, 32, 1]
MaxLDS: 163840
MaxOccupancy: 40
MbskPrefetchMethod: 0
MfmaInitCVgprs: true
NoLdsWriteCode: true
NoReject: false
NoTailLoop: false
NonDTLTailLoopA: true
NonDTLTailLoopB: true
NonTemporal: -1
NonTemporalA: 0
NonTemporalB: 0
NonTemporalC: 0
NonTemporalD: 4
NonTemporalE: 0
NonTemporalMetadata: 0
NonTemporalWS: 0
NumElementsPerBatchStore: 0
NumElementsPerThread: 192
NumGlobalWriteVectorsPerThread: 192
NumLoadsA: 8
NumLoadsB: 6
NumLoadsCoalescedA: 1
NumLoadsCoalescedB: 1
NumLoadsPerpendicularA: 8
NumLoadsPerpendicularB: 6
NumThreads: 256
NumTotalPackedLoadsA: 8
NumTotalPackedLoadsB: 6
NumWaveSplitK: 1
OptNoLoadLoop: 1
PackedC0IdxChars: [I]
PackedC0IndicesX: [0]
PackedC1IdxChars: [J]
PackedC1IndicesX: [1]
PrefetchGlobalRead: 2
PrefetchLocalRead: 1
PreloadKernArgs: true
SFCWGM:
- [1, 1]
- [1, 1]
ScheduleGlobalRead: 1
ScheduleIterAlg: 3
ScheduleLocalWrite: 1
SolutionIndex: 552
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_SAV_UserArgs_MT256x192x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA1_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA1024_LBSPPB1024_LBSPPM0_LPA8_LPB8_LPM0_LRVW8_LWPMn1_MIAV0_MIWT8_6_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB2_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM8_WGMXCC2_WGMXCCGn1
SourceSwap: 1
SpaceFillingAlgo: []
StaggerU: 0
StaggerUMapping: 0
StaggerUStride: 128
StorePriorityOpt: false
StoreRemapVectorWidth: 0
StoreSwapAddr: false
StoreSyncOpt: 0
StoreVectorWidth: 1
StreamK: 3
StreamKAtomic: 0
StreamKFixupTreeReduction: 0
StreamKXCCMapping: 0
SubGroup0: 8
SubGroup1: 32
SubGroupA: 8
SubGroupB: 32
SuppressNoLoadLoop: false
SwapGlobalReadOrder: true
ThreadTile: [1, 1]
ThreadTile0: 32
ThreadTile1: 6
ThreadTileA: 32
ThreadTileB: 6
TransposeLDS: 1
TransposeLDSMetadata: true
ULSGRODoubleG2L: 0
UnrollLoopSwapGlobalReadOrder: 0
UnrollMajorLDSA: false
UnrollMajorLDSB: true
UnrollMajorLDSMetadata: true
Use64bShadowLimit: 1
UseCustomMainLoopSchedule: true
UseDirect32XEmulation: false
UseDot2F32XEmulation: false
UseDotInstruction: false
UseF32XEmulation: false
UseGeneralizedNLCOneA: true
UseGeneralizedNLCOneB: true
UseGeneralizedNLCOneMetadata: false
UseInstOffsetForGRO: 0
UsePLRPack: false
UseSgprForGRO: 0
Valid: true
VectorStore: -1
VectorWidthA: 1
VectorWidthB: 2
WaveSeparateGlobalReadA: 0
WaveSeparateGlobalReadB: 0
WaveSeparateGlobalReadMetadata: 0
WaveSplitK: false
WavefrontSize: 64
WorkGroup: [32, 8, 1]
WorkGroupMapping: 8
WorkGroupMappingXCC: 2
WorkGroupMappingXCCGroup: -1
WorkGroupReduction: false
WorkspaceCheck: [4, 0, 0]
_DepthU: 64
_DepthUA: 64
_DepthUB: 64
_DepthUMetadata: 64
_GlobalAccumulation: PartialsBuffer
_UseSgprForGRO: 0
_VectorStore: 1
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 0
enableGLTrA: false
enableGLTrB: false
enableLDSTrA: true
enableLDSTrB: false
numSubTiles: 1
reorderGRInstForDTVA: false
reorderGRInstForDTVB: false
tailLoopOptA: false
tailLoopOptB: false
- [2, 3, 0, 1]
- - - [112, 491520, 1, 128]
- [0, 0.0]
Expand Down Expand Up @@ -131658,6 +131902,8 @@
- [550, 0]
- - [3072, 5120, 1, 8192]
- [551, 0.0]
- - [4096, 3072, 1, 8192]
- [552, 0.0]
- null
- null
- DeviceEfficiency
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -582,6 +582,45 @@ def _get_schedule_256x192x64_16bit(kernel, useLDSTr, TLDS):
}
syncCode = syncTable[1::2]
nglshift = nllshift = 14 # vmcnt shift for ngl and nll
elif isNN(kernel) and useLDSTr and TLDS == 1:
kernel["SwapGlobalReadOrder"] = True
#index and code pair
syncTable = [-1, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment="wait for LRA1"),
15, SWaitCnt(dscnt=4, vlcnt=-1, vscnt=-1, comment="wait for LRB0"),
Comment thread
jfactory07 marked this conversation as resolved.
15, SBarrier(comment=""),
46, SWaitCnt(dscnt=0, vlcnt=-1, vscnt=-1, comment=""),
51, SWaitCnt(dscnt=-1, vlcnt=14, vscnt=-1, comment="wait for previous set of global reads"),
51, SBarrier(comment=""),
63, SWaitCnt(dscnt=-1, vlcnt=14-4, vscnt=-1, comment="wait for previous set of global reads"),
63, SBarrier(comment=""),
]
optSchedule = {
'SYNC' : [syncTable[::2]],
'GRIncA': [[0,1,2,3,4,5,6,7,8]],
'GRIncB': [[9,10,11,12,13,14,15,16,17]],

'LRB0': [[-1, 0, 1, 2, 3, 4],
[0, 1, 2, 3, 4, 5]],
'LRA0': [[6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 25, 27, 29, 31, 33, 35],
[7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 26, 28, 30, 32, 34, 36]],
'GRA': [[15,15, 17,17, 27,27, 29,29, 31,31, 33,33],
[16, 16, 18, 18, 28, 28, 30, 30, 32, 32, 34, 34]],

'GRB': [[51,51, 53,53, 55,55, 57,57, 67,67, 69,69, 71,71, 73,73],
[52,52, 54,54, 56,56, 58,58, 68,68, 70,70, 72,72, 74,74]],
'LRB1': [[51, 53, 55, 57, 59, 61],
[52, 54, 56, 58, 60, 62]],
'LRA1': [[63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93],
[64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94]],

'LRSB': [[14]],
'LRSA': [[45]],
'LWSB': [[94]],
'LWSA': [[94]],
'LCC' : [[95, 95]],
}
syncCode = syncTable[1::2]
nglshift = nllshift = 14 # vmcnt shift for ngl and nll
else:
return False, None

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,48 @@ BenchmarkProblems:
- Range: [[192], [320], [1], [32, 64, 256]]
- Range: [[6144], [10240], [1], [64, 64, 256]]
- BiasTypeArgs: ['b']
- # BenchmarkProblemSizeGroup - Standard - All problem
InitialSolutionParameters:
BenchmarkCommonParameters:
- KernelLanguage: ["Assembly"]
ForkParameters:
- MatrixInstruction:
- [16, 16,32, 1, 1, 8, 6, 2,2 ]
- PrefetchGlobalRead: [2]
- PrefetchLocalRead: [1]
- DepthU: [64]
- ScheduleIterAlg: [3]
- ExpandPointerSwap: [0]
- TransposeLDS: [1]
- LocalReadVectorWidth: [8]
- GlobalReadVectorWidthA: [8]
- GlobalReadVectorWidthB: [8]
- DirectToLds: [1]
- StreamK: [3]
- LdsPadA: [-1]
- LdsPadB: [-1]
- StaggerU: [0]
- 1LDSBuffer: [0]
- NonTemporalA: [3]
- NonTemporalB: [3]
- NonTemporalD: [4]
- SourceSwap: [1]
- LdsPadA: [-1]
- LdsPadB: [-1]
- LdsBlockSizePerPadA: [-1]
- LdsBlockSizePerPadB: [-1]
- LDSTrInst: [1]
- UseSgprForGRO: [0]
- UseCustomMainLoopSchedule: [0,1]
BenchmarkJoinParameters:
BenchmarkFinalParameters:
- ProblemSizes:
- Exact: [4096, 3072, 1, 8192]
- Range: [[256], [192], [1], [64, 64, 256]]
- Range: [[256], [192], [1], [1,1,64]]
- Range: [[256], [192], [1], [32, 64, 256]]
- Range: [[8192], [6144], [1], [64, 64, 256]]
- BiasTypeArgs: ['b']
########################################
# HHS NN - standard
########################################
Expand Down
Loading