Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions clients/gtest/matmul_gtest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,48 @@ Tests:
unit_check: 1
gpu_arch: '94[0-2]'

# TODO: extend to all f8 transpose and datatype if necessary
# Pre-checkin matmul test using the f8_precision_dst_f8 function preset with
# both A and B transposed (transA: T, transB: T).
- name: matmul_real_f8_dst_f8_TT
category: pre_checkin
function:
# Alias to an anchor defined elsewhere in this file (not visible here).
matmul: *f8_precision_dst_f8
# Size list: the degenerate 1x1x1 case, then for each K in {127, 128, 129}
# one of M/N takes the values 1, 2, 3 while the other equals K (plus the
# M = N = 1 case for each K).
matrix_size:
- { M: 1, N: 1, K: 1 }
- { M: 1, N: 1, K: 127 }
- { M: 1, N: 127, K: 127 }
- { M: 2, N: 127, K: 127 }
- { M: 3, N: 127, K: 127 }
- { M: 127, N: 1, K: 127 }
- { M: 127, N: 2, K: 127 }
- { M: 127, N: 3, K: 127 }
- { M: 1, N: 1, K: 128 }
- { M: 1, N: 128, K: 128 }
- { M: 2, N: 128, K: 128 }
- { M: 3, N: 128, K: 128 }
- { M: 128, N: 1, K: 128 }
- { M: 128, N: 2, K: 128 }
- { M: 128, N: 3, K: 128 }
- { M: 1, N: 1, K: 129 }
- { M: 1, N: 129, K: 129 }
- { M: 2, N: 129, K: 129 }
- { M: 3, N: 129, K: 129 }
- { M: 129, N: 1, K: 129 }
- { M: 129, N: 2, K: 129 }
- { M: 129, N: 3, K: 129 }
transA: T
transB: T
alpha: 1
# NOTE(review): list-valued fields below are presumably expanded by the test
# harness into one test per combination - confirm against the harness.
beta: [ 0.0, 2.0 ]
scaleA: [0, 1]
scaleB: [0, 1]
scaleC: [0]
scaleD: [0]
bias_vector: [0, 1]
bias_type: f32_r
unit_check: 1
# Restricted to '942' only; the preceding entry uses the wider '94[0-2]'.
gpu_arch: '942'


- name: matmul_real_1b_dst_f8_SCDNotInt
category: pre_checkin
function:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@
UseE: false
UseInitialStridesAB: false
UseInitialStridesCD: false
UseScaleAB: "Scalar"
UseScaleAB: Scalar
UseScaleAlphaVec: 1
UseScaleCD: true
- - 1LDSBuffer: 0
Expand Down Expand Up @@ -275,7 +275,7 @@
UseE: false
UseInitialStridesAB: false
UseInitialStridesCD: false
UseScaleAB: "Scalar"
UseScaleAB: Scalar
UseScaleAlphaVec: 1
UseScaleCD: true
ScheduleGlobalRead: 1
Expand Down Expand Up @@ -534,7 +534,7 @@
UseE: false
UseInitialStridesAB: false
UseInitialStridesCD: false
UseScaleAB: "Scalar"
UseScaleAB: Scalar
UseScaleAlphaVec: 1
UseScaleCD: true
ScheduleGlobalRead: 1
Expand Down Expand Up @@ -793,7 +793,7 @@
UseE: false
UseInitialStridesAB: false
UseInitialStridesCD: false
UseScaleAB: "Scalar"
UseScaleAB: Scalar
UseScaleAlphaVec: 1
UseScaleCD: true
ScheduleGlobalRead: 1
Expand Down Expand Up @@ -850,15 +850,283 @@
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 2
# Solution entry with SolutionIndex 3 for the Assembly kernel
# Cijk_Alik_Bjlk_F8F8S_BH_BiasSH_AS_SAB_SCD_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_MIWT1_1.
# Values stated below: ISA [9, 4, 2], MacroTile0/1 16x16, DepthU 32,
# MatrixInstruction [16, 16, 32, 1], NumThreads 64, GlobalSplitU 1,
# TransposeA 1 and TransposeB 1.
# NOTE(review): the parameters look machine-generated and interdependent
# (e.g. the Lds* offsets and sizes); presumably they should be regenerated
# rather than edited by hand - confirm.
- 1LDSBuffer: 0
ActivationAlt: false
ActivationFuncCall: true
ActivationFused: true
AssertFree0ElementMultiple: 1
AssertFree1ElementMultiple: 1
AssertSummationElementMultiple: 1
AssignedDerivedParameters: true
AssignedProblemIndependentDerivedParameters: true
BufferLoad: true
BufferStore: true
CUCount: null
ClusterLocalRead: 0
CodeObjectVersion: default
ConvertAfterDS: false
CustomKernelName: ''
DepthU: 32
DirectToLds: false
DirectToLdsA: false
DirectToLdsB: false
DirectToVgprSparseMetadata: false
EdgeType: ShiftPtr
EnableF32XdlMathOp: false
EnableMatrixInstruction: true
ExpandPointerSwap: 0
ForceDisableShadowInit: false
GlobalReadPerMfma: 1
GlobalReadVectorWidthA: 8
GlobalReadVectorWidthB: 1
GlobalSplitU: 1
GlobalSplitUAlgorithm: MultipleBuffer
GlobalSplitUCoalesced: false
GlobalWriteVectorWidth: 1
GroupLoadStore: false
GuaranteeNoPartialA: true
GuaranteeNoPartialB: true
GuaranteeNoPartialMetadata: true
ISA: [9, 4, 2]
InnerUnroll: 1
InterleaveAlpha: 0
InternalSupportParams: {KernArgsVersion: 1, SupportCustomStaggerU: true, SupportCustomWGM: true,
SupportUserGSU: true, UseUniversalArgs: true}
KernelLanguage: Assembly
KernelNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSH_AS_SAB_SCD_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_MIWT1_1
LSCA: 32
LSCB: 16
LSPA: 16
LSPB: 4
LVCA: 4
LVCB: 16
LVPA: 2
LVPB: 4
LdsBlockSizePerPadA: 128
LdsBlockSizePerPadB: 128
LdsBlockSizePerPadMetadata: 0
LdsInitCVgprs: false
LdsNumBytes: 3392
LdsNumElementsAlignedA: 768
LdsNumElementsAlignedB: 768
LdsNumElementsAlignedMetadata: 0
LdsOffsetA: 0
LdsOffsetA_Blk: 2048
LdsOffsetB: 768
LdsOffsetB_Blk: 2816
LdsOffsetBias: 0
LdsOffsetBiasGSU: 0
LdsOffsetBiasNonGSU: 0
LdsOffsetMetadata: 768
LdsOffsetMetadata_Blk: 2816
LdsPadA: 8
LdsPadB: 16
LdsPadMetadata: 0
LocalReadVectorWidth: 8
LocalSplitU: 1
LocalWritePerMfma: -1
LocalWriteUseSgprA: false
LocalWriteUseSgprB: false
LoopIters: 1
LoopUnroll: 32
MFMA_BF16_1K: false
MIArchVgpr: false
MIBlock: [16, 16, 32, 1, 1, 1]
MIInputPerThread: 8
MIInputPerThreadA: 8
MIInputPerThreadB: 8
MIInputPerThreadMetadata: 8
MIOutputVectorWidth: 4
MIRegPerOut: 1
MIWaveGroup: [1, 1]
MIWaveTile: [1, 1]
MIWaveTileA: 1
MIWaveTileB: 1
MIWaveTileMetadata: 0
MacroTile0: 16
MacroTile1: 16
MacroTileA: 16
MacroTileB: 16
MagicDivAlg: 2
MatrixInstB: 1
MatrixInstBM: 1
MatrixInstBN: 1
MatrixInstK: 32
MatrixInstM: 16
MatrixInstN: 16
MatrixInstruction: [16, 16, 32, 1]
MaxOccupancy: 40
MaxVgprNumber: 256
MinVgprNumber: 0
NoLdsWriteCode: false
NoReject: false
NoTailLoop: false
NonTemporal: -1
NonTemporalA: 0
NonTemporalB: 0
NonTemporalC: 0
NonTemporalD: 0
NonTemporalE: 0
NonTemporalMetadata: 0
NumElementsPerBatchStore: 16
NumElementsPerThread: 4
NumGlobalWriteVectorsPerThread: 4
NumLoadsA: 1
NumLoadsB: 8
NumLoadsCoalescedA: 1
NumLoadsCoalescedB: 1
NumLoadsPerpendicularA: 1
NumLoadsPerpendicularB: 8
NumThreads: 64
OptNoLoadLoop: 0
PackedC0IdxChars: [I]
PackedC0IndicesX: [0]
PackedC1IdxChars: [J]
PackedC1IndicesX: [1]
PrefetchGlobalRead: 2
PrefetchLocalRead: 0
PreloadKernArgs: true
# NOTE(review): the keys from Activation through UseScaleCD appear to be the
# nested members of ProblemType (the alphabetical ordering restarts here and
# resumes at ScheduleGlobalRead) - confirm nesting against the real file,
# since indentation is not visible in this view.
ProblemType:
Activation: true
ActivationComputeDataType: 0
ActivationNoGuard: false
ActivationType: all
AllowNoFreeDims: false
AssignedDerivedParameters: true
Batched: true
BetaOnlyUseBias: false
BiasDataTypeList: [0, 4]
BiasSrc: D
ComplexConjugateA: false
ComplexConjugateB: false
ComputeDataType: 0
DataType: 11
DataTypeA: 11
DataTypeB: 11
DataTypeE: 11
DestDataType: 11
F32XdlMathOp: 0
Gradient: false
GroupedGemm: false
HighPrecisionAccumulate: true
Index0: 0
Index01A: 0
Index01B: 1
Index1: 1
IndexAssignmentsA: [3, 0, 2]
IndexAssignmentsB: [1, 3, 2]
IndexAssignmentsLD: [4, 5, 6, 7]
IndexAssignmentsMetadata: [3, 0, 2]
IndexUnroll: 3
IndexUnrollA: 0
IndexUnrollB: 1
IndexUnrollM: 0
IndicesBatch: [2]
IndicesFree: [0, 1]
IndicesSummation: [3]
MirrorDimsA: []
MirrorDimsB: []
MirrorDimsMetadata: []
NumIndicesBatch: 1
NumIndicesC: 3
NumIndicesFree: 2
NumIndicesLD: 4
NumIndicesSummation: 1
OperationType: GEMM
SetConstStrideA: []
SetConstStrideB: []
SetConstStrideBias: []
SilentHighPrecisionAccumulate: false
Sparse: 0
StochasticRounding: false
StridedBatched: true
SupportUserArgs: true
TLUA: false
TLUB: true
Tensor0: 0
Tensor1: 1
TileA: 0
TileAwareSelection: false
TileB: 1
TotalIndices: 4
TransposeA: 1
TransposeB: 1
UseBeta: true
UseBias: 1
UseE: false
UseInitialStridesAB: false
UseInitialStridesCD: false
UseScaleAB: Scalar
UseScaleAlphaVec: 1
UseScaleCD: true
ScheduleGlobalRead: 1
ScheduleIterAlg: 3
ScheduleLocalWrite: 1
SolutionIndex: 3
SolutionNameMin: Cijk_Alik_Bjlk_F8F8S_BH_BiasSH_AS_SAB_SCD_SAV_UserArgs_MT16x16x32_MI16x16x1_SN_GSU1_MIWT1_1_SU0_SUM0_SUS0_WGM1
SourceSwap: 1
StaggerU: 0
StaggerUMapping: 0
StaggerUStride: 0
StorePriorityOpt: false
StoreRemapVectorWidth: 0
StoreSyncOpt: 0
StoreVectorWidth: 1
SubGroup0: 4
SubGroup1: 16
SubGroupA: 4
SubGroupB: 16
SuppressNoLoadLoop: false
ThreadTile: [1, 1]
ThreadTile0: 4
ThreadTile1: 1
ThreadTileA: 4
ThreadTileB: 1
TransposeLDS: 1
TransposeLDSMetadata: true
ULSGRODoubleG2L: 0
UnrollLoopSwapGlobalReadOrder: 0
UnrollMajorLDSA: true
UnrollMajorLDSB: false
UnrollMajorLDSMetadata: true
Use64bShadowLimit: 1
UseInstOffsetForGRO: 0
UseSgprForGRO: -1
Valid: true
VectorStore: -1
VectorWidthA: 1
VectorWidthB: 1
WaveSeparateGlobalReadA: 0
WaveSeparateGlobalReadB: 0
WaveSeparateGlobalReadMetadata: 0
WavefrontSize: 64
WorkGroup: [16, 4, 1]
WorkGroupMapping: 1
WorkGroupMappingXCC: 1
WorkGroupMappingXCCGroup: 0
WorkGroupReduction: false
WorkspaceCheck: [4, 0, 1]
_DepthU: 32
_DepthUA: 32
_DepthUB: 32
_DepthUMetadata: 32
_GlobalAccumulation: MultipleBuffer
_UseSgprForGRO: 1
_VectorStore: 1
_WorkspaceSizePerElemBias: 0
_WorkspaceSizePerElemC: 4
_staggerStrideShift: 0
- [2, 3, 0, 1]
- - - [1, 128, 1, 128, 1, 1, 128, 128]
- - - [1, 128, 1, 128]
- [2, 2.80861]
- - [128, 128, 1, 128, 128, 128, 128, 128]
- - [128, 128, 1, 128]
- [0, 237.234]
- - [127, 127, 1, 128, 127, 127, 128, 127]
- - [127, 127, 1, 128]
- [1, 220.532]
- - [129, 129, 1, 128, 129, 129, 129, 129]
- - [129, 129, 1, 128]
- [2, 217.741]
- - [3, 3, 1, 3]
- [3, 0.0]
- null
- null
- DeviceEfficiency
Expand Down
Loading