Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,389 @@
- {MinimumRequiredVersion: 4.33.0}
- gfx1200
- gfx1200
- [Device 73f0]
- Activation: true
ActivationComputeDataType: 0
ActivationNoGuard: false
ActivationType: hipblaslt_all
AllowNoFreeDims: false
AssignedDerivedParameters: true
Batched: true
BetaOnlyUseBias: false
BiasDataTypeList: [0]
BiasSrc: D
ComplexConjugateA: false
ComplexConjugateB: false
ComputeDataType: 0
DataType: 4
DataTypeA: 4
DataTypeAmaxD: 0
DataTypeB: 4
DataTypeE: 4
DestDataType: 4
F32XdlMathOp: 0
Gradient: false
GroupedGemm: false
HighPrecisionAccumulate: true
Index0: 0
Index01A: 0
Index01B: 1
Index1: 1
IndexAssignmentsA: [0, 3, 2]
IndexAssignmentsB: [1, 3, 2]
IndexAssignmentsLD: [4, 5, 6, 7]
IndexAssignmentsMetadata: [3, 0, 2]
IndexUnroll: 3
IndexUnrollA: 1
IndexUnrollB: 1
IndexUnrollM: 0
IndicesBatch: [2]
IndicesFree: [0, 1]
IndicesSummation: [3]
MirrorDimsA: []
MirrorDimsB: []
MirrorDimsMetadata: []
NumIndicesBatch: 1
NumIndicesC: 3
NumIndicesFree: 2
NumIndicesLD: 4
NumIndicesSummation: 1
OperationType: GEMM
OutputAmaxD: false
SetConstStrideA: []
SetConstStrideB: []
SetConstStrideBias: []
SilentHighPrecisionAccumulate: false
Sparse: 0
StochasticRounding: false
StridedBatched: true
SupportUserArgs: true
SwizzleTensorA: false
SwizzleTensorB: false
TLUA: true
TLUB: true
Tensor0: 0
Tensor1: 1
TileA: 0
TileAwareSelection: false
TileB: 1
TotalIndices: 4
TransposeA: 0
TransposeB: 1
UseBeta: true
UseBias: 1
UseE: true
UseInitialStridesAB: false
UseInitialStridesCD: false
UseScaleAB: ''
UseScaleAlphaVec: 1
UseScaleCD: false
- - 1LDSBuffer: 0
ActivationAlt: false
ActivationFuncCall: true
ActivationFused: true
AssertAIGreaterThanEqual: -1
AssertAILessThanEqual: -1
AssertFree0ElementMultiple: 1
AssertFree1ElementMultiple: 1
AssertSummationElementMultiple: 1
AssignedDerivedParameters: true
AssignedProblemIndependentDerivedParameters: true
BaseName: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserAKgPe_Wv9DOQxiTMf0uqv29biFXQUfeD7e0IgqktbPPA=
BufferLoad: true
BufferStore: true
CUCount: null
CUOccupancy: -1
ClusterLocalRead: 0
CodeObjectVersion: '4'
ConvertAfterDS: false
CustomKernelName: ''
DebugStreamK: 0
DepthU: 32
DirectToLds: false
DirectToLdsA: false
DirectToLdsB: false
DirectToVgprA: 0
DirectToVgprB: 0
DirectToVgprSparseMetadata: false
EdgeType: ShiftPtr
EnableF32XEmulationLds: false
EnableF32XdlMathOp: false
EnableMatrixInstruction: true
ExpandPointerSwap: 0
ExpertSchedulingMode: 2
ForceDisableShadowInit: false
GlobalReadPerMfma: 1
GlobalReadVectorWidthA: 4
GlobalReadVectorWidthB: 1
GlobalSplitU: 1
GlobalSplitUAlgorithm: MultipleBuffer
GlobalSplitUCoalesced: false
GlobalSplitUWorkGroupMappingRoundRobin: false
GlobalWriteVectorWidth: 1
GroupLoadStore: false
GuaranteeNoPartialA: false
GuaranteeNoPartialB: true
GuaranteeNoPartialMetadata: true
ISA: [12, 0, 0]
InnerUnroll: 1
InterleaveAlpha: 0
InternalSupportParams: {KernArgsVersion: 2, SupportCustomStaggerU: true, SupportCustomWGM: true,
SupportUserGSU: true, UseUniversalArgs: true}
Kernel: true
KernelLanguage: Assembly
KernelNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSUAMB_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1
LDSTrInst: false
LSCA: 16
LSCB: 16
LSPA: 8
LSPB: 2
LVCA: 4
LVCB: 16
LVPA: 2
LVPB: 2
LdsBlockSizePerPadA: 128
LdsBlockSizePerPadB: 128
LdsBlockSizePerPadMetadata: 0
LdsBytesNoAmax: 6656
LdsInitCVgprs: false
LdsNumBytes: 6656
LdsNumElementsAlignedA: 1280
LdsNumElementsAlignedB: 1280
LdsNumElementsAlignedMetadata: 0
LdsOffsetA: 0
LdsOffsetA_Blk: 4096
LdsOffsetB: 1280
LdsOffsetB_Blk: 5376
LdsOffsetBias: 0
LdsOffsetBiasGSU: 0
LdsOffsetBiasNonGSU: 0
LdsOffsetMetadata: 1280
LdsOffsetMetadata_Blk: 5376
LdsPadA: 16
LdsPadB: 16
LdsPadMetadata: 0
LocalReadVectorWidth: 8
LocalSplitU: 1
LocalSplitUReuseLDS: 1
LocalWritePerMfma: -1
LocalWriteUseSgprA: false
LocalWriteUseSgprB: false
LoopIters: 2
LoopUnroll: 32
MFMA_BF16_1K: 0
MIArchVgpr: true
MIBlock: [16, 16, 16, 1, 1, 1]
MIInputPerThread: 8
MIInputPerThreadA: 8
MIInputPerThreadB: 8
MIInputPerThreadMetadata: 8
MIOutputVectorWidth: 8
MIRegPerOut: 1
MIWaveGroup: [1, 1]
MIWaveTile: [1, 1]
MIWaveTileA: 1
MIWaveTileB: 1
MIWaveTileMetadata: 0
MacroTile0: 16
MacroTile1: 16
MacroTileA: 16
MacroTileB: 16
MagicDivAlg: 2
MathClocksUnrolledLoop: 0
MatrixInstB: 1
MatrixInstBM: 1
MatrixInstBN: 1
MatrixInstK: 16
MatrixInstM: 16
MatrixInstN: 16
MatrixInstruction: [16, 16, 16, 1]
MaxLDS: 65536
MaxOccupancy: 40
MbskPrefetchOpt: 0
NoLdsWriteCode: false
NoReject: false
NoTailLoop: false
NonDTLTailLoopA: false
NonDTLTailLoopB: false
NonTemporal: -1
NonTemporalA: 0
NonTemporalB: 0
NonTemporalC: 0
NonTemporalD: 0
NonTemporalE: 0
NonTemporalMetadata: 0
NonTemporalWS: 0
NumElementsPerBatchStore: 0
NumElementsPerThread: 8
NumGlobalWriteVectorsPerThread: 8
NumLoadsA: 4
NumLoadsB: 16
NumLoadsCoalescedA: 1
NumLoadsCoalescedB: 1
NumLoadsPerpendicularA: 4
NumLoadsPerpendicularB: 16
NumThreads: 32
NumWaveSplitK: 1
OptNoLoadLoop: 1
PackedC0IdxChars: [I]
PackedC0IndicesX: [0]
PackedC1IdxChars: [J]
PackedC1IndicesX: [1]
PrefetchGlobalRead: 2
PrefetchLocalRead: 0
PreloadKernArgs: 0
ProblemType:
Activation: true
ActivationComputeDataType: 0
ActivationNoGuard: false
ActivationType: hipblaslt_all
AllowNoFreeDims: false
AssignedDerivedParameters: true
Batched: true
BetaOnlyUseBias: false
BiasDataTypeList: [0]
BiasSrc: D
ComplexConjugateA: false
ComplexConjugateB: false
ComputeDataType: 0
DataType: 4
DataTypeA: 4
DataTypeAmaxD: 0
DataTypeB: 4
DataTypeE: 4
DestDataType: 4
F32XdlMathOp: 0
Gradient: false
GroupedGemm: false
HighPrecisionAccumulate: true
Index0: 0
Index01A: 0
Index01B: 1
Index1: 1
IndexAssignmentsA: [0, 3, 2]
IndexAssignmentsB: [1, 3, 2]
IndexAssignmentsLD: [4, 5, 6, 7]
IndexAssignmentsMetadata: [3, 0, 2]
IndexUnroll: 3
IndexUnrollA: 1
IndexUnrollB: 1
IndexUnrollM: 0
IndicesBatch: [2]
IndicesFree: [0, 1]
IndicesSummation: [3]
MirrorDimsA: []
MirrorDimsB: []
MirrorDimsMetadata: []
NumIndicesBatch: 1
NumIndicesC: 3
NumIndicesFree: 2
NumIndicesLD: 4
NumIndicesSummation: 1
OperationType: GEMM
OutputAmaxD: false
SetConstStrideA: []
SetConstStrideB: []
SetConstStrideBias: []
SilentHighPrecisionAccumulate: false
Sparse: 0
StochasticRounding: false
StridedBatched: true
SupportUserArgs: true
SwizzleTensorA: false
SwizzleTensorB: false
TLUA: true
TLUB: true
Tensor0: 0
Tensor1: 1
TileA: 0
TileAwareSelection: false
TileB: 1
TotalIndices: 4
TransposeA: 0
TransposeB: 1
UseBeta: true
UseBias: 1
UseE: true
UseInitialStridesAB: false
UseInitialStridesCD: false
UseScaleAB: ''
UseScaleAlphaVec: 1
UseScaleCD: false
ScheduleGlobalRead: 1
ScheduleIterAlg: 3
ScheduleLocalWrite: 1
SolutionIndex: 0
SolutionNameMin: Cijk_Ailk_Bjlk_HHS_BH_Bias_S_AuxH_HA_S_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_LDSB0_AFC1_AFEM1_AFEM1_ASEM1_CLR0_CADS0_DTVA0_DTVB0_EPS0_FDSI0_GRPM1_GRVWA4_GRVWB1_GSU1_GSUAMB_GSUC0_GSUWGMRR0_GLS0_ISA1200_IU1_K1_LBSPPA128_LBSPPB128_LBSPPM0_LPA16_LPB16_LPM0_LRVW8_LWPMn1_MIAV1_MIWT1_1_MO40_NTn1_NTA0_NTB0_NTC0_NTD0_NTM0_NEPBS0_NLCA1_NLCB1_ONLL1_PGR2_PLR0_PKA0_SIA3_SS0_SU32_SUM0_SUS256_SPO0_SRVW0_SSO0_SVW8_SK0_SKXCCM0_TLDS2_ULSGRO0_USL1_UIOFGRO0_USFGROn1_VSn1_VWA1_VWB1_WSGRA0_WSGRB0_WS32_WG16_2_1_WGM8_WGMXCC1_WGMXCCGn1
SourceSwap: 0
StaggerU: 32
StaggerUMapping: 0
StaggerUStride: 256
StorePriorityOpt: false
StoreRemapVectorWidth: 0
StoreSwapAddr: false
StoreSyncOpt: 0
StoreVectorWidth: 8
StreamK: 0
StreamKAtomic: 0
StreamKXCCMapping: 0
SubGroup0: 2
SubGroup1: 16
SubGroupA: 2
SubGroupB: 16
SuppressNoLoadLoop: false
ThreadTile: [1, 1]
ThreadTile0: 8
ThreadTile1: 1
ThreadTileA: 8
ThreadTileB: 1
TransposeLDS: 2
TransposeLDSMetadata: true
ULSGRODoubleG2L: 0
UnrollLoopSwapGlobalReadOrder: 0
UnrollMajorLDSA: 1
UnrollMajorLDSB: 1
UnrollMajorLDSMetadata: true
Use64bShadowLimit: 1
UseDotInstruction: false
UseF32XEmulation: false
UseInstOffsetForGRO: 0
UseSgprForGRO: -1
Valid: true
VectorStore: -1
VectorWidthA: 1
VectorWidthB: 1
WaveSeparateGlobalReadA: 0
WaveSeparateGlobalReadB: 0
WaveSeparateGlobalReadMetadata: 0
WaveSplitK: false
WavefrontSize: 32
WorkGroup: [16, 2, 1]
WorkGroupMapping: 8
WorkGroupMappingXCC: 1
WorkGroupMappingXCCGroup: -1
WorkGroupReduction: false
WorkspaceCheck: [4, 0, 1]
_DepthU: 32
_DepthUA: 32
_DepthUB: 32
_DepthUMetadata: 32
_GlobalAccumulation: MultipleBuffer
_UseSgprForGRO: false
_VectorStore: 1
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 2
enableLDSTrA: false
enableLDSTrB: false
reorderGRInstForDTVA: false
reorderGRInstForDTVB: false
tailLoopOptA: false
tailLoopOptB: false
- [2, 3, 0, 1]
- - - [128, 128, 1, 128, 128, 128, 128, 128]
- [0, 0.06]
- null
- null
- DeviceEfficiency
- Equality
Loading
Loading