Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,310 @@
- {MinimumRequiredVersion: 4.33.0}
- strixhalo
- gfx1151
- [Device 1586]
- AllowNoFreeDims: false
AssignedDerivedParameters: true
Batched: true
ComplexConjugateA: false
ComplexConjugateB: false
ComputeDataType: 0
ConvolutionConfig: []
DataType: 0
DestDataType: 0
Fp16AltImpl: false
HighPrecisionAccumulate: false
Index0: 0
Index01A: 0
Index01B: 1
Index1: 1
IndexAssignmentsA: [0, 3, 2]
IndexAssignmentsB: [1, 3, 2]
IndexAssignmentsLD: [4, 5, 6, 7]
IndexUnroll: 3
IndexUnrollA: 1
IndexUnrollB: 1
IndicesBatch: [2]
IndicesFree: [0, 1]
IndicesSummation: [3]
MirrorDimsA: []
MirrorDimsB: []
NumIndicesBatch: 1
NumIndicesC: 3
NumIndicesFree: 2
NumIndicesLD: 4
NumIndicesSummation: 1
OperationType: GEMM
SetConstStrideA: []
SetConstStrideB: []
SilentHighPrecisionAccumulate: false
StridedBatched: true
TLUA: true
TLUB: true
Tensor0: 0
Tensor1: 1
TileA: 0
TileAwareSelection: false
TileB: 1
TotalIndices: 4
TransposeA: false
TransposeB: true
UseBeta: true
UseInitialStridesAB: false
UseInitialStridesCD: false
ZeroPadA: []
ZeroPadB: []
- - 1LDSBuffer: 0
AggressivePerfMode: 1
AssertAlphaValue: false
AssertBetaValue: false
AssertCEqualsD: false
AssertFree0ElementMultiple: 1
AssertFree1ElementMultiple: 1
AssertMinApproxSize: 0
AssertSizeEqual: {}
AssertSizeGreaterThan: {}
AssertSizeLessThan: {}
AssertSizeMultiple: {}
AssertStrideAEqual: {0: 1}
AssertStrideBEqual: {0: 1}
AssertStrideCEqual: {0: 1}
AssertStrideDEqual: {0: 1}
AssertSummationElementMultiple: 1
AssignedDerivedParameters: true
AssignedProblemIndependentDerivedParameters: true
AtomicAddC: false
BufferLoad: true
BufferStore: true
CheckDimOverflow: 0
CheckTensorDimAsserts: false
CodeObjectVersion: default
CustomKernelName: ''
DepthU: 8
DepthULdsDivisor: 1
DirectToLds: false
DirectToLdsA: false
DirectToLdsB: false
DirectToVgprA: false
DirectToVgprB: false
DisableAtomicFail: 0
DisableKernelPieces: 0
DisableVgprOverlapping: false
EdgeType: ShiftPtr
EnableMatrixInstruction: false
ExpandPointerSwap: 0
Fp16AltImpl: false
FractionalLoad: 0
GlobalLoadVectorWidthA: 1
GlobalLoadVectorWidthB: 1
GlobalRead2A: true
GlobalRead2B: true
GlobalReadCoalesceGroupA: true
GlobalReadCoalesceGroupB: true
GlobalReadCoalesceVectorA: true
GlobalReadCoalesceVectorB: true
GlobalReadPerMfma: 1
GlobalReadVectorWidth: 1
GlobalSplitU: 1
GlobalSplitUAlgorithm: SingleBuffer
GlobalSplitUSummationAssignmentRoundRobin: true
GlobalSplitUWorkGroupMappingRoundRobin: false
GlobalWriteVectorWidth: 1
GroupLoadStore: false
GuaranteeNoPartialA: true
GuaranteeNoPartialB: true
ISA: [11, 5, 0]
InnerUnroll: 1
InterleaveAlpha: 0
KernelLanguage: Assembly
LSCA: 32
LSCB: 32
LSPA: 8
LSPB: 8
LVCA: 32
LVCB: 32
LVPA: 8
LVPB: 8
LdcEqualsLdd: false
LdsBlockSizePerPad: 0
LdsBlockSizePerPadA: 0
LdsBlockSizePerPadB: 0
LdsInitCVgprs: false
LdsNumElements: 512
LdsOffsetA: 0
LdsOffsetB: 256
LdsPadA: 0
LdsPadB: 0
LocalDotLayout: 1
LocalRead2A: true
LocalRead2B: true
LocalReadVectorWidth: 1
LocalSplitU: 1
LocalWrite2A: true
LocalWrite2B: true
LocalWritePerMfma: -1
LocalWriteUseSgprA: false
LocalWriteUseSgprB: false
LoopDoWhile: false
LoopIters: 8
LoopTail: true
LoopUnroll: 8
MACInstruction: FMA
MIArchVgpr: false
MacroTile0: 32
MacroTile1: 32
MacroTileA: 32
MacroTileB: 32
MacroTileShapeMax: 64
MacroTileShapeMin: 1
MagicDivAlg: 2
MatrixInstruction: []
MaxOccupancy: 40
MaxVgprNumber: 256
MinVgprNumber: 0
NoLdsWriteCode: false
NoReject: false
NoTailLoop: false
NonTemporalA: 0
NonTemporalB: 0
NonTemporalC: 0
NonTemporalD: 0
NumElementsPerBatchStore: 0
NumElementsPerThread: 4
NumGlobalWriteVectorsPerThread: 4
NumLoadsA: 1
NumLoadsB: 1
NumLoadsCoalescedA: 1
NumLoadsCoalescedB: 1
NumLoadsPerpendicularA: 1
NumLoadsPerpendicularB: 1
NumThreads: 256
OptNoLoadLoop: 1
OptPreLoopVmcnt: 0
PackBatchDims: 0
PackFreeDims: 1
PackGranularity: 2
PackSummationDims: 0
PackedC0IdxChars: [I]
PackedC0IndicesX: [0]
PackedC1IdxChars: [J]
PackedC1IndicesX: [1]
PerformanceSyncLocation: -1
PerformanceWaitCount: -1
PerformanceWaitLocation: -1
PersistentKernel: 0
PersistentKernelAlongBatch: false
PrefetchAcrossPersistent: 0
PrefetchAcrossPersistentMode: 0
PrefetchGlobalRead: false
PrefetchLocalRead: true
ProblemType:
AllowNoFreeDims: false
AssignedDerivedParameters: true
Batched: true
ComplexConjugateA: false
ComplexConjugateB: false
ComputeDataType: 0
ConvolutionConfig: []
DataType: 0
DestDataType: 0
Fp16AltImpl: false
HighPrecisionAccumulate: false
Index0: 0
Index01A: 0
Index01B: 1
Index1: 1
IndexAssignmentsA: [0, 3, 2]
IndexAssignmentsB: [1, 3, 2]
IndexAssignmentsLD: [4, 5, 6, 7]
IndexUnroll: 3
IndexUnrollA: 1
IndexUnrollB: 1
IndicesBatch: [2]
IndicesFree: [0, 1]
IndicesSummation: [3]
MirrorDimsA: []
MirrorDimsB: []
NumIndicesBatch: 1
NumIndicesC: 3
NumIndicesFree: 2
NumIndicesLD: 4
NumIndicesSummation: 1
OperationType: GEMM
SetConstStrideA: []
SetConstStrideB: []
SilentHighPrecisionAccumulate: false
StridedBatched: true
TLUA: true
TLUB: true
Tensor0: 0
Tensor1: 1
TileA: 0
TileAwareSelection: false
TileB: 1
TotalIndices: 4
TransposeA: false
TransposeB: true
UseBeta: true
UseInitialStridesAB: false
UseInitialStridesCD: false
ZeroPadA: []
ZeroPadB: []
ReplacementKernel: false
ScheduleGlobalRead: 1
ScheduleIterAlg: 1
ScheduleLocalWrite: 1
SolutionIndex: 0
SolutionNameMin: Cijk_Ailk_Bjlk_SB_MT32x32x8_SN_
SourceSwap: false
StaggerU: 32
StaggerUMapping: 0
StaggerUStride: 256
StoreCInUnroll: false
StoreCInUnrollExact: false
StoreCInUnrollInterval: 1
StoreCInUnrollPostLoop: false
StorePriorityOpt: false
StoreRemapVectorWidth: 0
StoreSyncOpt: 0
StoreVectorWidth: 4
SubGroup0: 16
SubGroup1: 16
SubGroupA: 16
SubGroupB: 16
SuppressNoLoadLoop: false
ThreadTile: [2, 2]
ThreadTile0: 2
ThreadTile1: 2
ThreadTileA: 2
ThreadTileB: 2
TransposeLDS: 0
UnrollIncIsDepthU: 0
UnrollMajorLDSA: 0
UnrollMajorLDSB: 0
UnrollMemFence: false
Use64bShadowLimit: 1
UseInstOffsetForGRO: 0
UseSgprForGRO: -1
Valid: true
VectorAtomicWidth: 1
VectorStore: -1
VectorWidth: 1
WaveSeparateGlobalReadA: 0
WaveSeparateGlobalReadB: 0
WavefrontSize: 64
WorkGroup: [16, 16, 1]
WorkGroupMapping: 8
WorkGroupMappingType: B
_DepthULds: 8
_GlobalAccumulation: null
_UseSgprForGRO: 1
_VectorStore: 1
_WorkspaceSizePerElemC: 0
_staggerStrideShift: 3
allowLRVWforTLUandMI: false
- [2, 3, 0, 1]
- - - [126, 126, 2, 66, 126, 126, 126, 126]
- [0, 0]
- null
- null
- DeviceEfficiency
Loading