Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26328,20 +26328,20 @@
AssertSummationElementMultiple: 1
AssignedDerivedParameters: true
AssignedProblemIndependentDerivedParameters: true
BaseName: Cijk_Ailk_Bljk_BBS_BH_BiasSB_HAS_SAV_UserArgs_MT192x224x64_MI16fwZD_SxDnnVOKmLpTDrPrhupLHmkAIN1IXhJjWIM824=
BaseName: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT1J2ed5Fhxf8DLJDiYbpPN48d6A8VIyjP0ZhnMLknaejw=
BufferLoad: true
BufferStore: true
CUCount: null
CUOccupancy: -1
ClusterLocalRead: 0
CodeObjectVersion: 4
ClusterLocalRead: 1
CodeObjectVersion: '4'
ConvertAfterDS: false
CustomKernelName: ''
DebugStreamK: 0
DepthU: 64
DirectToLds: 0
DirectToLds: true
DirectToLdsA: false
DirectToLdsB: false
DirectToLdsB: true
DirectToVgprA: false
DirectToVgprB: false
DirectToVgprSparseMetadata: false
Expand Down Expand Up @@ -26370,7 +26370,7 @@
SupportUserGSU: false, UseUniversalArgs: true}
Kernel: true
KernelLanguage: Assembly
KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1
KernelNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1
LDSTrInst: 1
LSCA: 64
LSCB: 64
Expand All @@ -26381,13 +26381,13 @@
LVPA: 4
LVPB: 4
LdsBlockSizePerPadA: 3072
LdsBlockSizePerPadB: 128
LdsBlockSizePerPadB: 1024
LdsBlockSizePerPadMetadata: 0
LdsBytesNoAmax: 126208
LdsBytesNoAmax: 124160
LdsInitCVgprs: false
LdsNumBytes: 126208
LdsNumBytes: 124160
LdsNumElementsAlignedA: 24832
LdsNumElementsAlignedB: 35840
LdsNumElementsAlignedB: 33792
LdsNumElementsAlignedMetadata: 0
LdsOffsetA: 0
LdsOffsetA_Blk: 65536
Expand All @@ -26406,11 +26406,11 @@
LocalSplitUReuseLDS: 1
LocalWritePerMfma: -1
LocalWriteUseSgprA: false
LocalWriteUseSgprB: false
LocalWriteUseSgprB: true
LoopIters: 2
LoopUnroll: 64
MFMA_BF16_1K: false
MIArchVgpr: 0
MIArchVgpr: false
MIBlock: [16, 16, 32, 1, 1, 1]
MIInputPerThread: 8
MIInputPerThreadA: 8
Expand All @@ -26419,14 +26419,14 @@
MIOutputVectorWidth: 4
MIRegPerOut: 1
MIWaveGroup: [2, 2]
MIWaveTile: [6, 7]
MIWaveTile: [6, 8]
MIWaveTileA: 6
MIWaveTileB: 7
MIWaveTileB: 8
MIWaveTileMetadata: 0
MacroTile0: 192
MacroTile1: 224
MacroTile1: 256
MacroTileA: 192
MacroTileB: 224
MacroTileB: 256
MagicDivAlg: 2
MathClocksUnrolledLoop: 0
MatrixInstB: 1
Expand All @@ -26439,29 +26439,29 @@
MaxLDS: 163840
MaxOccupancy: 40
MbskPrefetchMethod: 0
MfmaInitCVgprs: false
MfmaInitCVgprs: true
NoLdsWriteCode: false
NoReject: false
NoTailLoop: false
NonDTLTailLoopA: true
NonDTLTailLoopB: true
NonTemporal: -1
NonTemporalA: 1
NonTemporalA: 4
NonTemporalB: 0
NonTemporalC: 0
NonTemporalD: 4
NonTemporalE: 0
NonTemporalMetadata: 0
NonTemporalWS: 0
NumElementsPerBatchStore: 0
NumElementsPerThread: 168
NumGlobalWriteVectorsPerThread: 168
NumElementsPerThread: 192
NumGlobalWriteVectorsPerThread: 192
NumLoadsA: 6
NumLoadsB: 7
NumLoadsB: 8
NumLoadsCoalescedA: 3
NumLoadsCoalescedB: 1
NumLoadsPerpendicularA: 2
NumLoadsPerpendicularB: 7
NumLoadsPerpendicularB: 8
NumThreads: 256
NumWaveSplitK: 1
OptNoLoadLoop: 1
Expand All @@ -26472,27 +26472,22 @@
PrefetchGlobalRead: 2
PrefetchLocalRead: 1
PreloadKernArgs: true
SFCWGM:
- [1, 1]
- [1, 1]
ScheduleGlobalRead: 1
ScheduleIterAlg: 3
ScheduleLocalWrite: 1
SolutionIndex: 112
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x224x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTLA0_DTLB0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LDSTI1_LBSPPA3072_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_7_MO40_NTn1_NTA1_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU16_SUM0_SUS512_SPO1_SRVW0_SSO1_SVW1_SK3_SKFTR0_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM24_WGMXCC2_WGMXCCGn1
SolutionNameMin: Cijk_Ailk_Bljk_BBS_BH_Bias_HA_S_SAV_UserArgs_MT192x256x64_MI16x16x1_CMS_SN_LDSB0_AFC0_AFEM1_AFEM1_ASEM1_CLR1_CADS0_DTLA0_DTLB1_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA8_GRVWB8_GSU0_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA950_IU1_K1_LBSPPA3072_LBSPPB1024_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV0_MIWT6_8_MO40_NTn1_NTA0_NTB0_NTC0_NTD4_NTM0_NEPBS0_NLCA3_NLCB1_ONLL1_PGR2_PLR1_PKA1_SIA3_SS1_SU0_SUM0_SUS128_SPO0_SRVW0_SSO0_SVW1_SK3_SKXCCM0_TLDS1_ULSGRO0_USL1_UIOFGRO0_USFGRO0_VSn1_VWA1_VWB8_WSGRA0_WSGRB0_WS64_WG32_8_1_WGM16_WGMXCC2_WGMXCCGn1
SourceSwap: 1
SpaceFillingAlgo: []
StaggerU: 16
StaggerU: 0
StaggerUMapping: 0
StaggerUStride: 512
StorePriorityOpt: 1
StaggerUStride: 128
StorePriorityOpt: false
StoreRemapVectorWidth: 0
StoreSwapAddr: false
StoreSyncOpt: 1
StoreSyncOpt: 0
StoreVectorWidth: 1
StreamK: 3
StreamKAtomic: 0
StreamKFixupTreeReduction: 0
StreamKXCCMapping: 0
SubGroup0: 8
SubGroup1: 32
Expand All @@ -26502,9 +26497,9 @@
SwapGlobalReadOrder: false
ThreadTile: [1, 1]
ThreadTile0: 24
ThreadTile1: 7
ThreadTile1: 8
ThreadTileA: 24
ThreadTileB: 7
ThreadTileB: 8
TransposeLDS: 1
TransposeLDSMetadata: true
ULSGRODoubleG2L: 0
Expand All @@ -26513,24 +26508,24 @@
UnrollMajorLDSB: true
UnrollMajorLDSMetadata: true
Use64bShadowLimit: 1
UseCustomMainLoopSchedule: 1
UseCustomMainLoopSchedule: true
UseDot2F32XEmulation: true
UseDotInstruction: false
UseF32XEmulation: false
UseInstOffsetForGRO: 0
UsePLRPack: false
UseSgprForGRO: 1
UseSgprForGRO: 0
Valid: true
VectorStore: -1
VectorWidthA: 1
VectorWidthB: 1
VectorWidthB: 8
WaveSeparateGlobalReadA: 0
WaveSeparateGlobalReadB: 0
WaveSeparateGlobalReadMetadata: 0
WaveSplitK: false
WavefrontSize: 64
WorkGroup: [32, 8, 1]
WorkGroupMapping: 24
WorkGroupMapping: 16
WorkGroupMappingXCC: 2
WorkGroupMappingXCCGroup: -1
WorkGroupReduction: false
Expand All @@ -26540,13 +26535,11 @@
_DepthUB: 64
_DepthUMetadata: 64
_GlobalAccumulation: PartialsBuffer
_UseSgprForGRO: false
_UseSgprForGRO: 0
_VectorStore: 1
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 2
enableGLTrA: false
enableGLTrB: false
_staggerStrideShift: 0
enableLDSTrA: true
enableLDSTrB: false
reorderGRInstForDTVA: false
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ BenchmarkProblems:
BenchmarkFinalParameters:
- ProblemSizes:
- Range: [[255, 1, 257], [255, 1, 257], [1], [1, 37, 192]]
- Exact: [3264, 3072, 1, 1536]
- BiasTypeArgs: ['b']
- # BenchmarkProblemSizeGroup - Standard - All problem
InitialSolutionParameters:
Expand Down
Loading